我想解析一個網站,正在使用 HTMLParser 模塊。問題是,我想解析出現在註釋 <!-- /topOfPage --> 之後的第一個 <a href=""> 標籤,
但我真的不知道該怎麼做。我在文檔中發現有一個叫做 handle_comment 的函數,
但還沒弄清楚如何正確使用它。
我目前的程式碼如下(標籤:html、解析器、python):
import HTMLParser
class LinkFinder(HTMLParser.HTMLParser):
    """Collect ``<a href=...>`` links and remember the first one after a marker comment.

    Attributes set on the instance:
        in_linktag: True while the parser is between an ``<a href=...>`` start
            tag and the next ``</a>`` end tag.
        url_cache: one list per link found, shaped ``[href, text, text, ...]``
            where the trailing items are the text chunks seen inside the link.
        seen_marker: True once a marker comment (see MARKER_COMMENTS) was parsed.
        first_link_after_marker: href of the first link whose start tag comes
            after the marker comment, or None if there is no such link (yet).
    """

    # Comment bodies, compared after stripping surrounding whitespace, that
    # count as the marker. "topOfPage" is kept so the previously accepted
    # spelling still matches; "/topOfPage" is what "<!-- /topOfPage -->" yields.
    MARKER_COMMENTS = ("topOfPage", "/topOfPage")

    def __init__(self, *args, **kwargs):
        # Can't use super() - HTMLParser is an old-style class
        HTMLParser.HTMLParser.__init__(self, *args, **kwargs)
        self.in_linktag = False
        self.url_cache = []
        self.seen_marker = False
        self.first_link_after_marker = None

    def handle_comment(self, data):
        """Record (and echo) the marker comment when the parser reaches it.

        ``data`` is the raw text between ``<!--`` and ``-->``, including any
        padding spaces, so it is stripped before being compared.
        """
        if data.strip() in self.MARKER_COMMENTS:
            self.seen_marker = True
            print(data)

    def handle_starttag(self, tag, attrs):
        """Start a new ``url_cache`` entry for every ``<a>`` tag carrying an href."""
        if tag != "a":
            return
        attr_map = dict(attrs)
        if "href" not in attr_map:
            # e.g. '<a name="...">' - an anchor, not a link
            return
        href = attr_map["href"]
        self.in_linktag = True
        self.url_cache.append([href])
        if self.seen_marker and self.first_link_after_marker is None:
            self.first_link_after_marker = href

    def handle_endtag(self, tag):
        """Stop collecting link text at ``</a>``."""
        if tag == "a" and self.in_linktag:  # ignore '<a name=""...'
            self.in_linktag = False

    def handle_data(self, data):
        """Attach text found inside a link to that link's ``url_cache`` entry."""
        if self.in_linktag:
            self.url_cache[-1].append(data)
# Sample document fed to the parser. Tags must start with "<" immediately
# followed by the tag name (or "!--" for a comment); with a space in between,
# HTMLParser reports the text as plain data instead of a tag or comment.
TESTDATA = """
<html>
<body>
<div>
<ul>
<!-- /topOfPage -->
<tr>
<td class="empty-cell-left"> </td>
<td class="image">
<a href="http://test" rel="nofollow">
<ul>
</div>
</body>
</html>
"""
def main():
    """Run a LinkFinder over TESTDATA and print the links it collected."""
    finder = LinkFinder()
    finder.feed(TESTDATA)
    finder.close()  # flush anything the parser is still buffering
    print(finder.url_cache)


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
怎麼辦?
非常感謝你! – user1010775