
How do I scrape more than the first instance of a triply nested list of links in Python? I want to work out the simplest way to record the contents of pages that are linked from pages that are linked from an original page. I want my output to be a table whose rows correspond to the contents of the pages three levels deep.

As you can see from the code, I can currently only get the first instance of the desired items on the third-level pages. Also, while my current code returns one row for each h2 item on the base URL, I want each h2 item to have multiple rows (as many as there are instances of "span.'case-doc-details' a" on the second level).

Some additional information: at each linked stage I do not know how many pages will be linked. I am using Python and ScraperWiki, and am new to both. I have tried to research this question, but hit a wall in knowing what to ask. Thanks in advance for your help.

import scraperwiki 
import urlparse 
import lxml.html 
import urllib 

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count'] = counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()

            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title = caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids = caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids) <= 2:
                    record['Rules'] = "None"
                    record['Treaty'] = "None"
                else:
                    record['Rules'] = ids[2].text_content()
                    record['Treaty'] = ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            # NOTE: only pars[0] is ever fetched here, so only the first
            # third-level page per case is recorded -- the behaviour this
            # question is about.
            caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[0].attrib.get('href')).read()
            caseroots2 = lxml.html.fromstring(caselinkurl2)
            # create another table element with rows, marked off with the case
            # that they came from; create all the rows.
            for i in pars:
                if len(pars) == 0:
                    record['DetailsURL'] = "None"
                else:
                    record['DetailsURL'] = pars[0].attrib.get('href')
                pars2 = caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2) == 0:
                    record['Doc Date'] = "None"
                else:
                    record['Doc Date'] = pars2[0].text_content()
                pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) == 0:
                    record['Doc Type Link'] = "None"
                    record['Doc Type'] = "None"
                else:
                    record['Doc Type Link'] = pars3[0].attrib.get('href')
                    record['Doc Type'] = pars3[0].text_content()
                pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4) == 0:
                    record['Claimant Nominee'] = "None"
                else:
                    record['Claimant Nominee'] = pars4[0].text_content()
                pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5) == 0:
                    record['Respondent Nominee'] = "None"
                else:
                    record['Respondent Nominee'] = pars5[0].text_content()
                pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6) == 0:
                    record['President'] = "None"
                else:
                    record['President'] = pars6[0].text_content()

            print record, '------------'
            scraperwiki.sqlite.save(['Count'], record)
            counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE: 
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All' 
scrape_and_look_for_next_link(url) 

Answers


Here is the code I have so far. It does not yet grab the document-link data (or save anything), but it should show the principle, which you can extend into the other functions:

import scraperwiki 
import urlparse 
import lxml.html 
import urllib 

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record


def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com' + linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE: 
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All' 
scrape_and_look_for_next_link(url) 
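
As a hedged extension of that principle (not part of the answer's tested code), saving could reuse the same scraperwiki call the question's script uses, keyed on a column that is unique per page; here the page URL is assumed to be unique:

# Untested sketch: at the end of scrape_page(), after "print record",
# persist the row, assuming 'url' is unique for each scraped page.
scraperwiki.sqlite.save(['url'], record)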

Thanks! This was super helpful. I ended up with a slightly different way of getting it to work, which I will post. – toddntucker


Here is what I got to work for this problem.

Some helpful general points:

1. Use an if/else block to separate the case where your key attribute has zero length from the case where it does not (a minimal sketch of this pattern follows the list).

2. Before that, create your dictionary.

3. In both the if and the else branches, give the print, save, and counter-increment instructions, and set the counter to zero before entering the loop.

4. In the else branch, create a for loop that goes through every instance i, recording the i-th instance of the key attribute you want to iterate over, and setting all the other attributes to the zeroth instance.

5. Finally, when handling an arbitrary number of triply nested links, it is generally best to scrape all the data from the lowest level if you can. In my case that worked because all the attributes I wanted to record were repeated at that level. In other situations I am not sure what the best approach would be.
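
For anyone new to the pattern, here is a minimal, self-contained sketch of points 1-4, using a hypothetical pars list and hypothetical field values rather than the live scraped data:

# Sketch of points 1-4 (hypothetical data, not the live scraper).
pars = ['doc-a.html', 'doc-b.html']    # the key attribute to iterate over

record = {}                            # point 2: create the dictionary first
counter = 0                            # point 3: zero the counter before the loop
if len(pars) == 0:                     # point 1: zero-length case
    record['DetailsURL'] = "None"
    record['Count'] = counter
    print record                       # point 3: print and increment (a save call would go here too)
    counter += 1
else:
    for i in range(len(pars)):         # point 4: record every instance i
        record['Count'] = counter
        record['DetailsURL'] = pars[i]
        print record
        counter += 1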

Thanks to Paul for nudging this forward.

import scraperwiki 
import urlparse 
import lxml.html 
import urllib 

def scrape_table(root):
    rows = root.cssselect("h2")
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            # create another table element with rows, marked off with the case
            # that they came from; create all the rows.
            if len(pars) == 0:
                record['DetailsURL'] = "None"
                record['Count'] = counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'], record)
                counter += 1
            else:
                # iterate over every document link, not just the first
                for i in range(0, len(pars)):
                    record['Count'] = counter
                    caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[i].attrib.get('href')).read()
                    caseroots2 = lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL'] = pars[i].attrib.get('href')
                    title = caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules = caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules) == 0:
                        record['Rules'] = "None"
                    else:
                        record['Rules'] = rules[0].text_content()
                    treaty = caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty) == 0:
                        record['Treaty'] = "None"
                    else:
                        record['Treaty'] = treaty[0].text_content()
                    pars2 = caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2) == 0:
                        record['Doc Date'] = "None"
                    else:
                        record['Doc Date'] = pars2[0].text_content()
                    pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) == 0:
                        record['Doc Type Link'] = "None"
                        record['Doc Type'] = "None"
                    else:
                        record['Doc Type Link'] = pars3[0].attrib.get('href')
                        record['Doc Type'] = pars3[0].text_content()
                    pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4) == 0:
                        record['Claimant Nominee'] = "None"
                    else:
                        record['Claimant Nominee'] = pars4[0].text_content()
                    pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5) == 0:
                        record['Respondent Nominee'] = "None"
                    else:
                        record['Respondent Nominee'] = pars5[0].text_content()
                    pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6) == 0:
                        record['President'] = "None"
                    else:
                        record['President'] = pars6[0].text_content()

                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'], record)
                    counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)


#START HERE: 
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All' 
scrape_and_look_for_next_link(url)
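
A small aside on all three scripts: they import urlparse but build URLs by string concatenation ('http://www.italaw.com/' + href), which misbehaves if an href is already absolute or starts with a slash. urlparse.urljoin handles those cases; a minimal sketch with a hypothetical href:

import urlparse

base = 'http://www.italaw.com/'
href = '/cases/documents/1234'    # hypothetical href scraped from a page
# urljoin copes with relative, root-relative and absolute hrefs,
# unlike plain string concatenation.
print urlparse.urljoin(base, href)    # http://www.italaw.com/cases/documents/1234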