2016-09-15 27 views
2

我試圖抓取一個表格(table),但遇到了一點障礙。我想確保每個標題下面的數據(例如Cert Issued (30))與相應的標題分組在一起。問題在於:當行標籤沒有嵌套在標題標籤之下時,如何抓取表標題下方的各行資訊。

當我嘗試使用下面的html時出現問題。

<tr> 
         <td>       
          <table class="EPSBResultGrid" cellspacing="0" rules="cols" border="1" style="border-color:DarkGray;border-collapse:collapse;"> 
    <tbody><tr class="EPSBResultGridHeader"> 
     <th scope="col">Cred</th><th scope="col">Description</th><th scope="col">Effective</th><th scope="col">Expiration</th><th scope="col">Restricted To</th> 
    </tr><tr class="EPSBResultGridHeader"> 
     <td colspan="9" style="border-width:1px;border-style:solid;font-weight:bold;">Do Not Print (00)</td> 
    </tr><tr class="EPSBResultGridItem"> 
     <td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_CRED_CODE">RANK1</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_CRED_DESC">Rank I</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EFF_DATE_txtDateMM">07<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateMM" value="07"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EFF_DATE_txtDateDD">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateDD" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EFF_DATE_txtDateYYYY">2000<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EFF_DATE_txtDateYYYY" value="2000"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl01" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl02" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl03" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl04" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl04" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EXP_DATE_txtDateMM">06<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateMM" value="06"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EXP_DATE_txtDateDD">30<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateDD" value="30"></span>-<span class="" 
id="ctl00$ContentPlaceHolder1$ctl00$ctl03$EXP_DATE_txtDateYYYY">2020<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl03_EXP_DATE_txtDateYYYY" value="2020"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl06" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl07" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl08" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl09" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl03_ctl09" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl03_ORG_NAME"></span></td> 
    </tr><tr class="EPSBResultGridHeader"> 
     <td colspan="9" style="border-width:1px;border-style:solid;font-weight:bold;">Cert Issued (30)</td> 
    </tr><tr class="EPSBResultGridAlternatingItem"> 
     <td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_CRED_CODE">G20</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_CRED_DESC">Middle School Teaching Field: Social Studies</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EFF_DATE_txtDateMM">07<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateMM" value="07"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EFF_DATE_txtDateDD">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateDD" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EFF_DATE_txtDateYYYY">1995<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EFF_DATE_txtDateYYYY" value="1995"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl01" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl02" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl03" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl04" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl04" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EXP_DATE_txtDateMM">06<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateMM" value="06"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EXP_DATE_txtDateDD">30<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateDD" 
value="30"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl05$EXP_DATE_txtDateYYYY">2020<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl05_EXP_DATE_txtDateYYYY" value="2020"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl06" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl07" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl08" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl09" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl05_ctl09" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl05_ORG_NAME"></span></td> 
    </tr><tr class="EPSBResultGridItem"> 
     <td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_CRED_CODE">G71</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_CRED_DESC">Middle School Teaching Field: Mathematics</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EFF_DATE_txtDateMM">07<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateMM" value="07"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EFF_DATE_txtDateDD">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateDD" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EFF_DATE_txtDateYYYY">1995<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EFF_DATE_txtDateYYYY" value="1995"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl01" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl02" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl03" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl04" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl04" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EXP_DATE_txtDateMM">06<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateMM" value="06"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EXP_DATE_txtDateDD">30<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateDD" 
value="30"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl06$EXP_DATE_txtDateYYYY">2020<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl06_EXP_DATE_txtDateYYYY" value="2020"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl06" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl07" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl08" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl09" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl06_ctl09" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl06_ORG_NAME"></span></td> 
    </tr><tr class="EPSBResultGridAlternatingItem"> 
     <td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_CRED_CODE">PCS</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_CRED_DESC">Provisional Certificate For Guidance Counselor, Secondary Grades 5-12</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EFF_DATE_txtDateMM">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateMM" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EFF_DATE_txtDateDD">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateDD" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EFF_DATE_txtDateYYYY">2016<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EFF_DATE_txtDateYYYY" value="2016"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl01" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl02" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl03" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl04" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl04" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EXP_DATE_txtDateMM">06<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateMM" value="06"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EXP_DATE_txtDateDD">30<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateDD" 
name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateDD" value="30"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl07$EXP_DATE_txtDateYYYY">2020<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl07_EXP_DATE_txtDateYYYY" value="2020"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl06" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl07" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl08" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl09" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl07_ctl09" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl07_ORG_NAME"></span></td> 
    </tr><tr class="EPSBResultGridItem"> 
     <td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_CRED_CODE">PMBF</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_CRED_DESC">Provisional Certificate For Teaching In The Middle Grades 5-8 (And For Other Assignments As Identified By Kentucky Program Of Studies)</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EFF_DATE_txtDateMM">07<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateMM" value="07"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EFF_DATE_txtDateDD">01<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateDD" value="01"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EFF_DATE_txtDateYYYY">2015<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EFF_DATE_txtDateYYYY" value="2015"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl01" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl02" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl03" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl04" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl04" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><nobr><span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EXP_DATE_txtDateMM">06<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateMM" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateMM" value="06"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EXP_DATE_txtDateDD">30<input type="hidden" 
id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateDD" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateDD" value="30"></span>-<span class="" id="ctl00$ContentPlaceHolder1$ctl00$ctl08$EXP_DATE_txtDateYYYY">2020<input type="hidden" id="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateYYYY" name="ctl00_ContentPlaceHolder1_ctl00_ctl08_EXP_DATE_txtDateYYYY" value="2020"></span></nobr><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl06" style="color:Red;display:none;">You must enter a day.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl07" style="color:Red;display:none;">You must enter a month.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl08" style="color:Red;display:none;">You must enter a year.</span><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl09" evaluationfunction="cb_verifydate_ctl00_ContentPlaceHolder1_ctl00_ctl08_ctl09" style="color:Red;visibility:hidden;">Invalid Date.</span></td><td><span id="ctl00_ContentPlaceHolder1_ctl00_ctl08_ORG_NAME"></span></td> 
    </tr> 
</tbody></table><br><span><span>Note: Suspended and revoked credentials are shown with red text with a strike through line.</span></span> 
         </td> 
        </tr> 

標題(class="EPSBResultGridHeader")下方的資訊行(class="EPSBResultGridItem" 和 class="EPSBResultGridAlternatingItem")並沒有嵌套在標題之內,因此我一直找不到辦法確保每個標題下的資訊與正確的標題分組在一起。

這是我的代碼:

# Group every licence sub-header with the data rows that belong to it.
#
# The data rows are *siblings* of the header rows, not children, so a header
# "owns" the <tr> elements that follow it up to -- but not including -- the
# next header row.  Relies on `tree`, `clean` and the `lheaderN` /
# `lheader_infoN` lists being defined earlier in the script.
HEADER_CLASS = "EPSBResultGridHeader"

# (header-text list, row-text list) pairs, in the order headers appear on the
# page.  Only the first five headers are stored, exactly as before.
slots = (
    (lheader1, lheader_info1),
    (lheader2, lheader_info2),
    (lheader3, lheader_info3),
    (lheader4, lheader_info4),
    (lheader5, lheader_info5),
)

# '//tr' rather than '/tr': the rows are matched whether or not the parsed
# document has a <tbody> between <table> and <tr>.
header = tree.xpath('.//table//tr[@class="%s"]' % HEADER_CLASS)

count = 0
for license_row in header:
    count = count + 1

    header_data = clean(license_row.xpath(".//text()"))

    # Walk forward through the sibling rows and stop at the next header, so
    # rows of later sections are not attributed to this one.
    row_text = []
    for sibling in license_row.itersiblings("tr"):
        if sibling.get("class") == HEADER_CLASS:
            break
        row_text.extend(sibling.xpath(".//text()"))
    nested_data = clean(row_text)

    # Same output as the former `print count, value` statements.
    print("%s %s" % (count, header_data))
    print("%s %s" % (count, nested_data))

    # Append licensure data
    if count <= len(slots):
        names, info = slots[count - 1]
        names.append(header_data)
        # The info lists are meant to be flat lists of strings, so extend
        # rather than call them.
        # NOTE(review): assumes clean() returns a list -- confirm.
        info.extend(nested_data)

# Pages with fewer than five headers still get a placeholder header entry in
# the unused slots; there are no rows to record for them.
for names, _info in slots[len(header):]:
    names.append('')

我的最終目標是有一個這樣的輸出:

>>>print lheader_info2 
['RANK1', 'Rank I', '07-01-2018', '06-30-2021'] 

>>>print lheader_info3 
['G20', 'Middle School Teaching Field: Social Studies----', 'G30', 'Middle School Teaching Field: English And Communications----', 'ILE2', 'Professional Certificate For Instructional Leadership -- Early Elementary School Principal, Grades K-4; Level II', '07-01-2017', '06-30-2021', 'ILM2', 'Professional Certificate For Instructional Leadership--Middle Grade School Principal, Grades 5-8; Level II', '07-01-2017', '06-30-2021', 'ILV2', 'Professional Certificate For Instructional Leadership--Supervisor Of Instruction, Grades K-12; Level II', '07-01-2018', '06-30-2021', 'PMBF', 'Provisional Certificate For Teaching In The Middle Grades 5-8 (And For Other Assignments As Identified By Kentucky Program Of Studies)', '07-01-2016', '06-30-2021'] 

我使用的是lxml,但如果BeautifulSoup看起來是更好的方法,我也可以改用它。

回答

2

我會找到每個子標題,然後迭代其後的 tr 兄弟節點(next tr siblings),一旦遇到另一個標題或到達表的末尾就跳出循環:

from collections import defaultdict 
from pprint import pprint 

import requests 
from bs4 import BeautifulSoup 

# Download the credential page and parse it with the built-in HTML parser.
url = "https://wd.kyepsb.net/EPSB.WebApps/KECI/view_data.aspx?id=37161"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Maps each sub-header title to the first-cell text of the rows beneath it.
data = defaultdict(list)

table = soup.find("table", class_="EPSBResultGrid")
section_rows = table.select("tr.EPSBResultGridHeader")

# The first header row holds the column titles, so start from the second one.
for section in section_rows[1:]:
    title = section.get_text(strip=True)

    # Rows are siblings of their header: collect them until the next header.
    for sibling in section.find_next_siblings("tr"):
        if "EPSBResultGridHeader" in sibling.get("class", []):
            break

        first_cell = sibling.td
        data[title].append(first_cell.get_text(strip=True))

pprint(dict(data))

打印:

{'Cert Issued (30)': ['G20', 'G30', 'ILE2', 'ILM2', 'ILV2', 'PMBF'], 
'Do Not Print (00)': ['RANK1'], 
'History (97)': ['ILE2', 'ILM2', 'ILV2', 'RANK1']} 

這裏的 [1:] 切片是爲了跳過最開始的頂級表頭。

+1

我想你需要的是 `data[header_name].extend([td.get_text(strip=True) for td in row.find_all("td")])` –

+1

@PadraicCunningham對於所有的單元格,是的,只是想給出如何對這些行進行「分組」的樣本。謝謝! – alecxe

+1

@PadraicCunningham 我很猶豫該把哪個答案選爲接受的答案,因爲兩個答案都很優秀,但我最終使用了alecxe的。在努力理解你們的答案的過程中,我學到的東西比我自己擺弄代碼一個多星期學到的還多。再次感謝! – otteheng

2

使用LXML

def pair():
    """Group the rows of the EPSBResultGrid table under their sub-headers.

    Fetches ``url``, walks the ``<tr>`` elements of the first
    ``table.EPSBResultGrid`` in document order and collects the cell text of
    every non-header row under the most recent ``EPSBResultGridHeader`` row.

    Returns:
        dict mapping each sub-header's first text node to a list of rows,
        where each row is a list with one whitespace-normalised string per
        ``<td>``.

    Raises:
        IndexError: if the page has no ``table.EPSBResultGrid`` or a header
            row contains no text.
        StopIteration: if the table has fewer than two rows.
    """
    tree = html.fromstring(requests.get(url).content)
    # get table and iterate over the trs
    iter_trs = tree.cssselect("table.EPSBResultGrid")[0].iter("tr")
    # skip the initial tr (the column titles)
    next(iter_trs)
    # first EPSBResultGridHeader
    start = next(iter_trs).xpath("td//text()")[0]
    nodes, tmp = {}, []
    # iterate over the rest of the nodes
    for node in iter_trs:
        # if we find another EPSBResultGridHeader, store what we have and
        # start a new group.
        if node.get("class") == "EPSBResultGridHeader":
            nodes[start] = tmp
            start, tmp = node.xpath("td//text()")[0], []
        else:
            # normalize-space() does not treat U+00A0 as whitespace, so strip
            # it explicitly.  A plain (non-raw) literal is required: in a raw
            # literal "\xa0" is four ordinary characters, not a no-break space.
            tmp.append([td.xpath("normalize-space(.)").replace(u"\xa0", "")
                        for td in node.xpath("./td")])
    # The last group is not followed by another header, so store it here;
    # otherwise the final section would be silently dropped.
    nodes[start] = tmp
    return nodes

這將使你:

{'Cert Issued (30)': [[u'G20', 
         u'Middle School Teaching Field: Social Studies', 
         u'--', 
         u'--', 
         u''], 
         [u'G30', 
         u'Middle School Teaching Field: English And Communications', 
         u'--', 
         u'--', 
         u''], 
         [u'ILE2', 
         u'Professional Certificate For Instructional Leadership -- Early Elementary School Principal, Grades K-4; Level II', 
         u'07-01-2017', 
         u'06-30-2021', 
         u''], 
         [u'ILM2', 
         u'Professional Certificate For Instructional Leadership--Middle Grade School Principal, Grades 5-8; Level II', 
         u'07-01-2017', 
         u'06-30-2021', 
         u''], 
         [u'ILV2', 
         u'Professional Certificate For Instructional Leadership--Supervisor Of Instruction, Grades K-12; Level II', 
         u'07-01-2018', 
         u'06-30-2021', 
         u''], 
         [u'PMBF', 
         u'Provisional Certificate For Teaching In The Middle Grades 5-8 (And For Other Assignments As Identified By Kentucky Program Of Studies)', 
         u'07-01-2016', 
         u'06-30-2021', 
         u'']], 
'Do Not Print (00)': [[u'RANK1', 
         u'Rank I', 
         u'07-01-2018', 
         u'06-30-2021', 
         u'']], 
'History (97)': [[u'ILE2', 
        u'Professional Certificate For Instructional Leadership -- Early Elementary School Principal, Grades K-4; Level II', 
        u'07-01-2012', 
        u'06-30-2017', 
        u''], 
        [u'ILM2', 
        u'Professional Certificate For Instructional Leadership--Middle Grade School Principal, Grades 5-8; Level II', 
        u'07-01-2012', 
        u'06-30-2017', 
        u''], 
        [u'ILV2', 
        u'Professional Certificate For Instructional Leadership--Supervisor Of Instruction, Grades K-12; Level II', 
        u'07-01-2013', 
        u'06-30-2018', 
        u''], 
        [u'RANK1', u'Rank I', u'12-15-1995', u'06-30-2018', u'']]} 

如果你想要扁平的列表,請改用 extend:tmp.extend(td.xpath("normalize-space(.)").replace(u"\xa0", "") for td in node.xpath("./td")):

{'Cert Issued (30)': [u'G20', 
         u'Middle School Teaching Field: Social Studies', 
         u'--', 
         u'--', 
         u'', 
         u'G30', 
         u'Middle School Teaching Field: English And Communications', 
         u'--', 
         u'--', 
         u'', 
         u'ILE2', 
         u'Professional Certificate For Instructional Leadership -- Early Elementary School Principal, Grades K-4; Level II', 
         u'07-01-2017', 
         u'06-30-2021', 
         u'', 
         u'ILM2', 
         u'Professional Certificate For Instructional Leadership--Middle Grade School Principal, Grades 5-8; Level II', 
         u'07-01-2017', 
         u'06-30-2021', 
         u'', 
         u'ILV2', 
         u'Professional Certificate For Instructional Leadership--Supervisor Of Instruction, Grades K-12; Level II', 
         u'07-01-2018', 
         u'06-30-2021', 
         u'', 
         u'PMBF', 
         u'Provisional Certificate For Teaching In The Middle Grades 5-8 (And For Other Assignments As Identified By Kentucky Program Of Studies)', 
         u'07-01-2016', 
         u'06-30-2021', 
         u''], 
'Do Not Print (00)': [u'RANK1', u'Rank I', u'07-01-2018', u'06-30-2021', u''], 
'History (97)': [u'ILE2', 
        u'Professional Certificate For Instructional Leadership -- Early Elementary School Principal, Grades K-4; Level II', 
        u'07-01-2012', 
        u'06-30-2017', 
        u'', 
        u'ILM2', 
        u'Professional Certificate For Instructional Leadership--Middle Grade School Principal, Grades 5-8; Level II', 
        u'07-01-2012', 
        u'06-30-2017', 
        u'', 
        u'ILV2', 
        u'Professional Certificate For Instructional Leadership--Supervisor Of Instruction, Grades K-12; Level II', 
        u'07-01-2013', 
        u'06-30-2018', 
        u'', 
        u'RANK1', 
        u'Rank I', 
        u'12-15-1995', 
        u'06-30-2018', 
        u'']}