通過JavaScript在文檔中檢測希伯來文單詞

在網站開發方面（而不是一般編程方面），我基本上是個新手，所以請原諒任何不正確的術語。

我想要構建一個腳本，當它被添加到HTML頁面中時，會檢測頁面中的每個希伯來文單詞，並將該單詞轉換爲一個HTML元素，例如帶有title屬性的超鏈接。

因此，以下內容：

<p>ראש הלשכה</p>

轉化爲：

<p><a title="word 1" href="#">ראש</a> <a title="word 2" href="#">הלשכה</a></p>

有意義嗎？

因此，我想首要任務是檢測頁面中的希伯來文單詞。我該怎麼做呢？除了翻閱jQuery文檔之外，我不知道從哪裏開始。

來源

2010-02-09 Judah Himango

HTML是Unicode嗎？ UTF-8，還是它可以是任何編碼？ – 2010-02-09 02:58:19

好問題。爲簡單起見，就假設是UTF-8吧。我希望它能在像 http://www.haaretz.co.il 這樣的文檔上工作。 – 2010-02-09 03:03:58

在字符串中搜索希伯來文單詞很簡單。使用一個匹配連續希伯來文碼點序列的正則表達式：

/[\u05D0-\u05FF]+/

由於JS支持函數式編程，我們可以很容易地編寫自己的函數來遍歷文檔樹，並對每個文本節點調用一個函數。首先是一些腳手架代碼。

/* Debugging scaffold: installs a global assert(succeeded, msg) unless the
    page already provides one. While dbgLvl is truthy, a failed assertion
    throws msg (or a default message when msg is falsy); when dbgLvl is
    falsy, assertions do nothing.
*/
if (!window.assert) {
    // change this to 0 for production release
    window.dbgLvl = 1;
    window.assert = function (succeeded, msg) {
        if (!dbgLvl || succeeded) {
            return;
        }
        throw msg ? msg : 'assertion failed';
    };
}

接下來，我們定義一個方法將字符串拆分爲數組，包括輸出中的分隔符。

/* String.separate is like String.split, but the result includes the
    separators, so unmatched and matched pieces alternate:
    [unmatched, matched, unmatched, ..., matched, unmatched].

    Arguments:
    separator (RegExp || string): A string is compiled into a global RegExp.
     A RegExp is used as given. For the first implementation below it must
     wrap the whole pattern in a capturing group ('/(...)/g'), because that
     implementation relies on split() returning the captured text.

    Returns an array of strings.

    These implementations of 'String.separate' will work for our purposes,
    but are buggy in general, due to differences in the implementation of
    String.split.

    The two misbehaviors we correct are including neither grouped patterns
    nor empty strings in the result, though the latter is only corrected
    when the missing empty string is at the start or the end.

    Feature detection: '-'.split(/(-)/) has an odd length (['', '-', ''] or
    ['-']) when split() includes captured groups, and an even length
    (['', ''] or []) when it does not.
*/
if ('-'.split(/(-)/).length & 1) {
    assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
    // split includes groups in result
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            // Wrap the pattern in a group unless it already looks grouped,
            // so that split() returns the separators too.
            if (separator.charAt(0) != '('
                || separator.charAt(separator.length-1) != ')')
            {
                separator = new RegExp('(' + separator + ')', 'g');
            } else {
                separator = new RegExp(separator, 'g');
            }
        }
        return this.split(separator);
    };
} else {
    if ('a'.split(/a/).length) {
        // empty strings included, grouped aren't
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            // fence: the matched separators; posts: the text between them.
            var fence = this.match(separator);
            if (!fence) {
                return [this];
            }
            var posts = this.split(separator);
            // Compare, don't assign: the original '=' resized posts and made
            // this check impossible to fail.
            assert(posts.length == fence.length+1,
                'split and match disagree on the number of separators');
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    } else {
        // neither empty strings nor groups are included. IE, you suck.
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            var fence = this.match(separator);
            if (!fence) {
                return [this];
            }
            var posts = this.split(separator);
            if (posts.length <= fence.length) {
                /* missing some posts. Assume that they are the first or
                    last, though this won't be true in general.
                */
                if (posts.length < fence.length) {
                    posts.unshift('');
                    posts.push('');
                } else {
                    // Exactly one post is missing: it is the leading one if
                    // the string starts with the first separator.
                    if (this.substring(0, fence[0].length) == fence[0]) {
                        posts.unshift('');
                    } else {
                        posts.push('');
                    }
                }
            }
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    }
}

接下來是一些節點謂詞。

/* Make sure Node.TEXT_NODE is available: create a minimal Node object when
    the browser has none, or add the constant when only it is missing.
*/
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node = {TEXT_NODE: 3};
}

/* isTextNode: true when node's nodeType equals Node.TEXT_NODE. */
function isTextNode(node) {
    var kind = node.nodeType;
    return kind == Node.TEXT_NODE;
}
/* hasKids: truthy when node has at least one child. Returns the number of
    children, or the falsy childNodes value itself when node has no
    childNodes collection.
*/
function hasKids(node) {
    if (!node.childNodes) {
        return node.childNodes;
    }
    return node.childNodes.length;
}
/* allNodes: predicate that accepts every node; the argument is ignored. */
function allNodes(node) {
    return true;
}

接下來是遍歷DOM的函數。

/*
    forEachChild: pre-order traversal of document tree. Applies a function to some nodes, determined by the 'which' and 'descendInto' arguments.

Arguments:
    which (function): Returns true if 'action' should be applied to a node.
    action (function): Takes a node and does something to it. It may detach or replace the node it is given; traversal then continues with the sibling that followed the node before 'action' ran.
    parent (Node): The node to start from.
    descendInto (function, optional): By default, forEachChild will descend into every child that itself has children. Place additional restrictions by passing this argument.
*/
var forEachChild = (function() {
    /* the actual implementation is made a local function so that the
        optional parameter can be handled efficiently.
    */
    function _forEachChild(which, action, node, descendInto) {
        var child = node.firstChild;
        var next;
        while (child) {
            // Read the sibling BEFORE calling action: once action replaces
            // child (as wrapText does), child.nextSibling is null and the
            // remaining siblings would be skipped.
            next = child.nextSibling;
            if (which(child)) {
                action(child);
            }
            if (hasKids(child) && descendInto(child)) {
                _forEachChild(which, action, child, descendInto);
            }
            child = next;
        }
    }
    return function (which, action, node, descendInto) {
        _forEachChild(which, action, node, descendInto || allNodes);
    };
})();

/* forEachNode: runs forEachChild over the whole document.

Arguments:
    which, action, descendInto: passed to forEachChild unchanged.
*/
function forEachNode(which, action, descendInto) {
    var root = document;
    return forEachChild(which, action, root, descendInto);
}

/* forEachTextNode: applies 'action' to the nodes of the document that
    isTextNode accepts.

Arguments:
    action, descendInto: passed to forEachNode unchanged.
*/
function forEachTextNode(action, descendInto) {
    var outcome = forEachNode(isTextNode, action, descendInto);
    return outcome;
}

/* forEachTextNodeInBody: like forEachTextNode, but the traversal starts at
    document.body instead of the document.

Arguments:
    action, descendInto: passed to forEachChild unchanged.
*/
function forEachTextNodeInBody(action, descendInto) {
    var body = document.body;
    return forEachChild(isTextNode, action, body, descendInto);
}

最後一組函數將文本節點中與模式匹配的文本替換爲您選擇的新節點。這一組函數（確切地說，是wrapText返回的函數）尚未經過完整的跨瀏覽器兼容性測試，包括它是否能正確處理文本方向。

/*
    wrapText replaces substrings in a text node with new nodes.

Arguments:
    pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
    replace (function): Takes a string and returns a Node to replace the string.

Returns a function that takes a text node. That function leaves the node
alone when nothing matches; otherwise it replaces the node, in its parent,
with the unmatched text (as text nodes) interleaved with the nodes returned
by 'replace'. Empty stretches of unmatched text produce no text node.
*/
function wrapText(pattern, replace) {
    return function (node) {
        /* chunks alternates unmatched and matched text, starting and ending
            with an unmatched (possibly empty) chunk: even indexes never
            match the pattern, odd indexes always do.
        */
        var chunks = node.nodeValue.separate(pattern);
        if (chunks.length < 2)
            return;
        // Check the parity before building anything, so a bad split is
        // reported before 'replace' has been called.
        assert(chunks.length % 2 == 1, 'even number of chunks in ['+chunks+'] when it should be odd.');
        var fragment = document.createDocumentFragment();
        var i;
        for (i=0; i < chunks.length; ++i) {
            if (i % 2) {
                fragment.appendChild(replace(chunks[i]));
            } else if (chunks[i].length) {
                // don't bother adding empty chunks.
                fragment.appendChild(document.createTextNode(chunks[i]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    };
}

/*
    createAnchorWrap builds a wrapper that puts a string inside a new anchor
    element and gives that anchor a title.

Arguments:
    title (string || function, optional): Source of the anchor's title.
     A function is called with the string being wrapped. A string is handed
     to createWordCounter and the resulting counter supplies the titles.
     Any other falsy value selects createWordCounter's default counter.

Returns a function that takes a string and returns an anchor element.
*/
function createAnchorWrap(title) {
    var titleFor = title;
    if (typeof titleFor == 'string') {
        titleFor = createWordCounter(titleFor);
    } else if (!titleFor) {
        titleFor = createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = titleFor(word);
        anchor.appendChild(document.createTextNode(word));
        return anchor;
    };
}

/*
    createWordCounter creates a word counter, which returns the number of
    times it's been called (including the current call), prefixed by a string.

Arguments:
    pre (string, optional): prefix for return value. Trailing spaces are
     collapsed to a single space; defaults to 'word '.

Returns a function that takes a string (ignored) and returns a string,
e.g. 'word 1' on the first call and 'word 2' on the second.

*/
function createWordCounter(pre) {
    var wordCount=0;
    if (pre) {
        pre = pre.replace(/ *$/, ' ');
    } else {
        pre = 'word ';
    }
    return function(text) {
        // Count this call before reporting; without the increment every
        // title came out as '<prefix> 0'.
        wordCount += 1;
        return pre + wordCount;
    };
}

最後要做的是在（例如）load事件處理程序或頁面底部的腳本中啓動這個過程。

/* Kick-off: wrap every run of characters in the range U+05D0-U+05FF found
    in the body's text nodes in an anchor titled by the default word counter.
*/
(function () {
    var hebrewRun = /([\u05D0-\u05FF]+)/g;
    var wrapInAnchor = wrapText(hebrewRun, createAnchorWrap());
    forEachTextNodeInBody(wrapInAnchor);
})();

如果你想改變標題的前綴，請將createWordCounter(...)的結果傳遞給createAnchorWrap。

來源

2010-02-09 03:11:56 outis

這樣的文件上工作好吧，那是一個開始。所以，Javascript內置了對RegEx的支持。好，很好。現在，關於在HTML文檔中查找文本的那一點...... – 2010-02-09 03:18:05

好的，所以現在您已經編寫了一些Javascript函數來遍歷樹。看起來我可以使用forEachTextNode(action)以某種方式將文本元素替換爲anchor元素。好的。我會看看我能做什麼。感謝你目前的幫助。 – 2010-02-09 03:28:49

請注意，使用JS庫（jQuery，Prototype，MooTools ...）可能仍然是一個好主意。 – outis 2010-02-09 03:33:54

通過JavaScript在文檔中檢測希伯來文單詞

回答

相關問題