我想確定最簡單的方法來記錄從原始網頁鏈接的網頁鏈接的網頁內容。我希望我的輸出成爲一個表格,其中的行與第三層深度頁面的內容相對應。如何刮除超過Python的三重鏈接列表鏈接的第一個實例?
正如你可以從代碼中看到的,我目前只能得到第三級頁所需的項目上的第一個實例。另外,雖然我當前的代碼將返回一行對應於基本URL上的每個h2項,但我希望每個h2項具有多行(與「span.'case-doc-details」a的實例一樣多第二層)。
一些額外的信息:在每一個連接的狀態,我不知道有多少頁將被鏈接。我正在使用Python和Scraperwiki,兩者都是新的。我試圖研究這個問題,但是在我知道要問什麼的問題上遇到了障礙。預先感謝您的幫助。
import scraperwiki
import urlparse
import lxml.html
import urllib
def scrape_table(root):
rows = root.cssselect("h2")
record = {}
counter=0
for row in rows:
table_cells = row.cssselect("h2 a")
for cell in table_cells:
record['Count']=counter
table_cellsurls = table_cells[0].cssselect("a")
record['CaseURL'] = table_cellsurls[0].attrib.get('href')
caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
#print caselinkurl
caseroots = lxml.html.fromstring(caselinkurl)
title=caseroots.cssselect("title")
record['Title'] = title[0].text_content()
ids=caseroots.cssselect("div div div div a")
for i in ids:
if len(ids)<=2:
record['Rules']="None"
record['Treaty']="None"
else:
record['Rules']=ids[2].text_content()
record['Treaty']=ids[3].text_content()
pars = caseroots.cssselect("span.'case-doc-details' a")
#print "pars length is", len(pars)
caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[0].attrib.get('href')).read()
caseroots2=lxml.html.fromstring(caselinkurl2)
#create another table element with rows, marked off with the case that they came from, create all the rows.
for i in pars:
if len(pars)==0:
record['DetailsURL']="None"
else:
record['DetailsURL']=pars[0].attrib.get('href')
pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
if len(pars2)==0:
record['Doc Date']="None"
else:
record['Doc Date']=pars2[0].text_content()
pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
if len(pars3) ==0:
record['Doc Type Link']="None"
record['Doc Type']="None"
else:
record['Doc Type Link']=pars3[0].attrib.get('href')
record['Doc Type']=pars3[0].text_content()
pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
if len(pars4)==0:
record['Claimant Nominee']="None"
else:
record['Claimant Nominee']=pars4[0].text_content()
pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
if len(pars5)==0:
record['Respondent Nominee']="None"
else:
record['Respondent Nominee']=pars5[0].text_content()
pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
if len(pars6)==0:
record['President']="None"
else:
record['President']=pars6[0].text_content()
print record, '------------'
scraperwiki.sqlite.save(['Count'],record)
counter+=1
def scrape_and_look_for_next_link(url):
html = scraperwiki.scrape(url)
print html
root = lxml.html.fromstring(html)
scrape_table(root)
#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
謝謝!這是超級有用的。我最終得到了一種稍微不同的工作方式。我會張貼。 – toddntucker