我知道這是一個遲到的回答,但你隨時可以使用 sanitize-html。它是爲 Node 編寫的,但我相當確定你可以對這個庫(或者你自己的代碼)運行 browserify。
請注意,它使用 lodash,所以如果您已經在使用 lodash,那麼您可能需要調整這個軟件包(package)。
這個例子比你需要的要多......我使用這個庫來清理輸入代碼,並把它轉換爲 markdown 存儲在 db 中;讀取之後,我再通過 marked 把它重新還原(re-hydrate)。
// convert/html-to-filtered-markdown.js
'use strict';
var sanitize = require('sanitize-html') //https://www.npmjs.org/package/sanitize-html
,toMarkdown = require('to-markdown').toMarkdown
;
module.exports = function convertHtmlToFilteredMarkdown(input, options) {
if (!input) return '';
options = options || {};
//basic cleanup, normalize line endings, normalize/reduce whitespace and extra line endings
var response = (input || '').toString().trim()
.replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
.replace(/「/g, '"') //remove fancy quotes
.replace(/」/g, '"') //remove fancy quotes
.replace(/‘/g, '\'') //remove fancy quotes
.replace(/’/g, '\'') //remove fancy quotes
;
//sanitize html input
response = sanitize(response, {
//don't allow table elements
allowedTags: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre' ],
//make orderd lists
transformTags: {
'ol': 'ul'
}
}).replace(/\r\n|\r|\n/g,'\n') //normalize line endings;
if (!options.tables) {
response = response.replace(/[\s\n]*\<(\/?)(table|thead|tbody|tr|th|td)\>[\s\n]*/g, '\n\n') //replace divs/tables blocks as paragraphs
}
//cleanup input further
response = response
.replace(/[\s\n]*\<(\/?)(div|p)\>[\s\n]*/g, '\n\n') //divs and p's to simple multi-line expressions
.replace(/\>#/g, '\n\n#') //cleanup #'s' after closing tag, ex: <a>...</a>\n\n# will be reduced via sanitizer
.replace(/\\s+\</,'<') //remove space before a tag open
.replace(/\>\s+\n?/,'>\n') //remove space after a tag close
.replace(/\&?nbsp\;?/g,' ') //revert nbsp to space
.replace(/\<\h[12]/g,'<h3').replace(/\<\/\h[12]/g,'</h3') //reduce h1/h2 to h3
;
//convert response to markdown
response = toMarkdown(response);
//normalize line endings
response = response
.replace(/(?:^|\n)##?[\b\s]/g,'\n### ') //reduce h1 and h2 to h3
.replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
.trim()
return response + '\n';
}
首先,謝謝!這樣可行。我唯一的問題是:是否可以在一個 replace() 中完成?HTML 文件可能非常大,效率是關鍵。我會自己再試驗一下。 – thechriskelley 2010-11-03 22:00:40
@thechriskelley:增加了一個只用一次 `replace` 的解決方案 – thejh 2010-11-03 22:58:45
非常好,謝謝! – thechriskelley 2010-11-04 21:53:56