2010-10-25 59 views
1

這是我第一次使用SAXParser(我在Android中使用它,但我認爲這對這個特定問題沒有什麼不同),我正在嘗試讀取來自RSS提要的數據。到目前爲止,它在大多數情況下都運作良好,但在遇到包含HTML編碼文本的標記時(例如&lt;a href="http://...),我遇到了麻煩。characters()方法只把&lt;讀取爲<,然後把後面的字符當作另一段獨立的內容,而不是一次取得整個文本。我寧願它直接原樣讀取,而不去實際轉換HTML。我的文檔處理程序(節選)代碼貼在下面。(問題標題:使用SAXParser從XML中檢索HTML編碼文本)

/**
 * Records which RSS section the parser has just entered and lazily creates
 * the objects that will receive that section's values.
 *
 * Nothing happens until a "channel" element has been seen. Element names are
 * compared case-insensitively against {@code localName}.
 *
 * @param uri       namespace URI of the element (not used here)
 * @param localName local element name; drives every decision in this method
 * @param qName     qualified element name (not used here)
 * @param attrs     element attributes (not used here)
 * @throws SAXException declared by the overridden callback; not thrown by this body
 */
@Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        final boolean opensChannel = localName.equalsIgnoreCase("channel");
        if (opensChannel) {
            inChannel = true;
        }

        // Elements outside of a channel are ignored.
        if (!inChannel) {
            return;
        }

        if (newFeed == null) {
            newFeed = new Feed();
        }

        if (localName.equalsIgnoreCase("image")) {
            if (feedImage == null) {
                feedImage = new Image();
            }
            inImage = true;
        } else if (localName.equalsIgnoreCase("item")) {
            if (newItem == null) {
                newItem = new Item();
            }
            if (itemList == null) {
                itemList = new ArrayList<Item>();
            }
            inItem = true;
        }
    }

    /**
     * Copies the most recently captured text ({@code currentValue}) into the
     * object that owns the element being closed.
     *
     * The sections are checked from innermost to outermost: an open item takes
     * precedence, then an open image, then the channel itself. Closing
     * "item", "image" or "channel" finalises that section and clears its flag.
     *
     * @param uri       namespace URI of the element (not used here)
     * @param localName local element name, compared case-insensitively
     * @param qName     qualified element name (not used here)
     * @throws SAXException declared by the overridden callback; not thrown by this body
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        final String text = currentValue;

        // ---- inside <item> ----
        if (inItem) {
            if (localName.equalsIgnoreCase("item")) {
                // Finished with this news item.
                itemList.add(newItem);
                newItem = null;
                inItem = false;
            } else if (localName.equalsIgnoreCase("title")) {
                newItem.setTitle(text);
            } else if (localName.equalsIgnoreCase("link")) {
                newItem.setLink(text);
            } else if (localName.equalsIgnoreCase("description")) {
                newItem.setDescription(text);
            } else if (localName.equalsIgnoreCase("author")) {
                newItem.setAuthor(text);
            } else if (localName.equalsIgnoreCase("category")) {
                newItem.addCategory(text);
            } else if (localName.equalsIgnoreCase("comments")) {
                newItem.setComments(text);
            } else if (localName.equalsIgnoreCase("guid")) {
                newItem.setGuid(text);
            } else if (localName.equalsIgnoreCase("pubDate")) {
                newItem.setPubDate(text);
            }
            // TODO: "enclosure" is to be implemented later.
            return;
        }

        // ---- inside <image> ----
        if (inImage) {
            if (localName.equalsIgnoreCase("image")) {
                // Finished with the feed image.
                newFeed.setImage(feedImage);
                feedImage = null;
                inImage = false;
            } else if (localName.equalsIgnoreCase("url")) {
                feedImage.setUrl(text);
            } else if (localName.equalsIgnoreCase("title")) {
                feedImage.setTitle(text);
            } else if (localName.equalsIgnoreCase("link")) {
                feedImage.setLink(text);
            }
            return;
        }

        // ---- directly inside <channel> ----
        if (!inChannel) {
            return;
        }
        if (localName.equalsIgnoreCase("channel")) {
            // Reached end of feed: publish the result and reset the working state.
            newFeed.setItems((ArrayList<Item>) itemList);
            finalFeed = newFeed;
            newFeed = null;
            inChannel = false;
        } else if (localName.equalsIgnoreCase("title")) {
            newFeed.setTitle(text);
        } else if (localName.equalsIgnoreCase("link")) {
            newFeed.setLink(text);
        } else if (localName.equalsIgnoreCase("description")) {
            newFeed.setDescription(text);
        } else if (localName.equalsIgnoreCase("language")) {
            newFeed.setLanguage(text);
        } else if (localName.equalsIgnoreCase("copyright")) {
            newFeed.setCopyright(text);
        } else if (localName.equalsIgnoreCase("category")) {
            newFeed.addCategory(text);
        }
    }

    /**
     * Remembers the text chunk just reported by the parser.
     *
     * NOTE: each call replaces the previously stored chunk. A SAX parser is
     * allowed to report one element's text in several calls (for example
     * around entity references such as {@code &lt;}), in which case only the
     * last chunk is kept.
     *
     * @param ch     buffer holding the characters
     * @param start  index of the first character of this chunk in {@code ch}
     * @param length number of characters in this chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        currentValue = String.valueOf(ch, start, length);
    }

我試圖解析的RSS提要的一個例子是this one

任何想法?

回答

1

萬一這能幫到其他人:我通過爲每個我感興趣的數據字段使用一個布爾值解決了這個問題。然後我不斷把字符追加到一個StringBuilder中,直到遇到結束標記;此時我取出StringBuilder的值,將其清空,並把對應的布爾值設爲false。

/**
 * Empties the text buffer and records which RSS element has just been opened.
 *
 * Opening "channel" creates a fresh feed and item list; opening "image" or
 * "item" creates the matching object and raises its section flag. For any
 * other element inside the channel, the boolean flag belonging to that field
 * is raised, chosen according to the section (image, item or channel) that is
 * currently open.
 *
 * @param uri       namespace URI of the element (not used here)
 * @param localName local element name, compared case-insensitively
 * @param qName     qualified element name (not used here)
 * @param attrs     element attributes (not used here)
 * @throws SAXException declared by the overridden callback; not thrown by this body
 */
@Override
    public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException {
        // Every element starts with an empty text buffer.
        sb.setLength(0);

        if (localName.equalsIgnoreCase("channel")) {
            inChannel = true;
            newFeed = new Feed();
            itemList = new ArrayList<Item>();
        }
        if (!inChannel) {
            return;
        }

        // Section openers are checked before the per-field flags.
        if (localName.equalsIgnoreCase("image")) {
            feedImage = new Image();
            inImage = true;
            return;
        }
        if (localName.equalsIgnoreCase("item")) {
            newItem = new Item();
            inItem = true;
            return;
        }

        if (inImage) {
            // Flags for image elements.
            if (localName.equalsIgnoreCase("title")) {
                imgTitle = true;
            } else if (localName.equalsIgnoreCase("link")) {
                imgLink = true;
            } else if (localName.equalsIgnoreCase("url")) {
                imgURL = true;
            }
            return;
        }

        if (inItem) {
            // Flags for item elements.
            if (localName.equalsIgnoreCase("title")) {
                iTitle = true;
            } else if (localName.equalsIgnoreCase("link")) {
                iLink = true;
            } else if (localName.equalsIgnoreCase("description")) {
                iDescription = true;
            } else if (localName.equalsIgnoreCase("author")) {
                iAuthor = true;
            } else if (localName.equalsIgnoreCase("category")) {
                iCategory = true;
            } else if (localName.equalsIgnoreCase("comments")) {
                iComments = true;
            } else if (localName.equalsIgnoreCase("guid")) {
                iGuid = true;
            } else if (localName.equalsIgnoreCase("pubdate")) {
                iPubDate = true;
            } else if (localName.equalsIgnoreCase("source")) {
                iSource = true;
            }
            return;
        }

        // Flags for elements that sit directly under the channel.
        if (localName.equalsIgnoreCase("title")) {
            fTitle = true;
        } else if (localName.equalsIgnoreCase("link")) {
            fLink = true;
        } else if (localName.equalsIgnoreCase("description")) {
            fDescription = true;
        } else if (localName.equalsIgnoreCase("language")) {
            fLanguage = true;
        } else if (localName.equalsIgnoreCase("copyright")) {
            fCopyright = true;
        } else if (localName.equalsIgnoreCase("category")) {
            fCategory = true;
        }
    }

    /**
     * Stores the text buffered for the element being closed into the object
     * that owns it, then lowers that element's flag and empties the buffer.
     *
     * Sections are checked in the order image, item, channel. Closing
     * "image", "item" or "channel" finalises that section:
     * the image is attached to the feed, the item is appended to the item
     * list, and the finished feed is published to {@code finalFeed}.
     *
     * All values are read from the {@code StringBuilder} buffer, because
     * {@code characters()} may be invoked several times for one element.
     *
     * @param uri       namespace URI of the element (not used here)
     * @param localName local element name, compared case-insensitively
     * @param qName     qualified element name (not used here)
     * @throws SAXException declared by the overridden callback; not thrown by this body
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (!inChannel) {
            return;
        }

        if (inImage) {
            if (localName.equalsIgnoreCase("image")) {
                // Leaving the image section: without this the handler would
                // stay in "image" mode and misroute every following element.
                newFeed.setImage(feedImage);
                feedImage = null;
                inImage = false;
            } else if (localName.equalsIgnoreCase("title")) {
                feedImage.setTitle(takeText());
                imgTitle = false;
            } else if (localName.equalsIgnoreCase("link")) {
                feedImage.setLink(takeText());
                imgLink = false;
            } else if (localName.equalsIgnoreCase("url")) {
                feedImage.setUrl(takeText());
                imgURL = false;
            }
            return;
        }

        if (inItem) {
            if (localName.equalsIgnoreCase("item")) {
                itemList.add(newItem);
                newItem = null;
                inItem = false;
            } else if (localName.equalsIgnoreCase("title")) {
                newItem.setTitle(takeText());
                iTitle = false;
            } else if (localName.equalsIgnoreCase("link")) {
                newItem.setLink(takeText());
                iLink = false;
            } else if (localName.equalsIgnoreCase("description")) {
                newItem.setDescription(takeText());
                iDescription = false;
            } else if (localName.equalsIgnoreCase("author")) {
                newItem.setAuthor(takeText());
                iAuthor = false;
            } else if (localName.equalsIgnoreCase("category")) {
                newItem.addCategory(takeText());
                iCategory = false;
            } else if (localName.equalsIgnoreCase("comments")) {
                newItem.setComments(takeText());
                iComments = false;
            } else if (localName.equalsIgnoreCase("guid")) {
                newItem.setGuid(takeText());
                iGuid = false;
            } else if (localName.equalsIgnoreCase("pubDate")) {
                newItem.setPubDate(takeText());
                iPubDate = false;
            } else if (localName.equalsIgnoreCase("source")) {
                // startElement raises iSource, so lower it again here. The
                // text is discarded: no Item setter for it is visible in this
                // snippet. TODO: store it once such a setter exists.
                sb.setLength(0);
                iSource = false;
            }
            // TODO: "enclosure" is to be implemented later.
            return;
        }

        // Elements that sit directly under the channel.
        if (localName.equalsIgnoreCase("channel")) {
            newFeed.setItems((ArrayList<Item>) itemList);
            finalFeed = newFeed;
            newFeed = null;
            inChannel = false;
        } else if (localName.equalsIgnoreCase("title")) {
            newFeed.setTitle(takeText());
            fTitle = false;
        } else if (localName.equalsIgnoreCase("link")) {
            newFeed.setLink(takeText());
            fLink = false;
        } else if (localName.equalsIgnoreCase("description")) {
            newFeed.setDescription(takeText());
            fDescription = false;
        } else if (localName.equalsIgnoreCase("language")) {
            newFeed.setLanguage(takeText());
            fLanguage = false;
        } else if (localName.equalsIgnoreCase("copyright")) {
            newFeed.setCopyright(takeText());
            fCopyright = false;
        } else if (localName.equalsIgnoreCase("category")) {
            newFeed.addCategory(takeText());
            fCategory = false;
        }
    }

    /** Returns the text buffered so far and empties the buffer. */
    private String takeText() {
        final String text = sb.toString();
        sb.setLength(0);
        return text;
    }

    /**
     * Appends the reported text chunk to the shared buffer, so that text
     * delivered over several calls ends up concatenated.
     *
     * @param ch     buffer holding the characters
     * @param start  index of the first character of this chunk in {@code ch}
     * @param length number of characters in this chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        sb.append(ch, start, length);
    }
+0

是的,這就是SAX解析器的工作原理 – Falmarri 2010-10-29 06:14:51

+0

顯然我發現了。 – kcoppock 2010-10-29 11:52:45

0

這類特殊字符應當包含在CDATA段中。你需要確保它們是以這種方式保存的,這樣SAX解析器才能正確處理它們。

4

很好。這個解決方案讓我困惑了一些,並且我無法像你那樣獲得localName的值,但是我仍然能夠使StringBuilder方法起作用。

我沒有替換下面這個方法:

public void characters(char[] ch, int start, int length) throws SAXException {

tempVal = new String(ch,start,length); 而是在該方法中添加了下面這一行:

tempSB = tempSB.append(new String(ch, start, length)); 

其中tempSB是一個StringBuilder對象。這意味着我不需要修改整個解析器,只在需要時改爲讀取StringBuilder即可。當我在startElement中遇到包含HTML的元素時,我使用:

tempSB.delete(0, tempSB.length()); 

而且在我的endElement使用:

tempText.setText(tempSB.toString()) ; 

就這麼簡單。在我的情況下,不需要複雜的布爾標誌系統,也不需要訪問localName——這個概念我一直沒弄明白。不過我訪問qName似乎沒有問題。

非常感謝kcoppock發佈您找到的解決方案。我一直在尋找幾個小時,這是我能找到簡潔明瞭的唯一文章。我正在做的任務非常緊迫,如果沒有你的幫助,我會失敗的。

+0

很高興它能幫助你!感謝您解釋您的改進。 :)祝你的項目好運。 – kcoppock 2011-04-20 12:29:43

+0

謝謝。這也適用於我! – 2013-01-29 19:50:45