2015-09-28 65 views
-1

只有第一個結果正在寫入csv,每行包含一個url的一個字母。這不是所有的URL都被寫入,每行一個。Python - 爲什麼這個數據被寫入文件不正確?

我在這段代碼的最後一部分沒有做什麼,導致只用結果之一而不是所有結果寫入cvs?

import requests 
from bs4 import BeautifulSoup 
import csv 

def grab_listings(): 
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/2/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/3/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/4/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/5/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/6/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/7/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/8/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/9/") 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class":"wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class":"frame"}): 
     return elem["href"] 

l = grab_listings() 


with open ("gyms.csv", "wb") as file: 
     writer = csv.writer(file) 
     for row in l: 
      writer.writerow(row) 
+2

你在第一次返回後將不會獲得更多的數據。 –

+2

你的函數在你第一次返回任何東西時停止執行,在第一個循環中 –

+4

你有兩個問題:1.'return'只能發生每個功能一次(你可能想要閱讀發電機和'yield');和2.因此,你因此迭代*單個字符串*並將每個字符傳遞給'writerow'。結果是不可避免的。 – jonrsharpe

回答

0

簡體:

import requests 
from bs4 import BeautifulSoup 
import csv 


def grab_listings(): 
    for i in range(0, 5): 
     url = "http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/{}/" 

     r = requests.get(url.format(i + 1)) 
     soup = BeautifulSoup(r.text, 'html.parser') 
     l_area = soup.find("div", {"class": "wlt_search_results"}) 

     for elem in l_area.findAll("a", {"class": "frame"}): 
      yield elem["href"] 

l = grab_listings() 


with open("gyms.csv", "w") as file: 
    writer = csv.writer(file) 
    for row in l: 
     writer.writerow(row) 
+1

這當然是代碼上的一個重大改進,但是我懷疑這對於OP來說更有用,如果你解釋了你已經改變了什麼,以及爲什麼**。 – jonrsharpe

+0

這段代碼與原來的代碼不一樣..你說得對,它可以被簡化,但第一個案例沒有附加頁面,其他所有代碼從2到9,而不是從1到5 - 薩洛51分鐘前 – Salo

+0

謝謝你的解答。我實際上使用過你的方法,因爲它顯示了一行中的每個URL,這正是我所尋找的。如上所述,每個字母用逗號分隔。 你能告訴我如何修改你的代碼,以便每個字母的逗號不會打破結果嗎? – McLeodx

1

所以我重構你的代碼了一下,我認爲它應該工作,你會想到它現在:

import requests 
from bs4 import BeautifulSoup 
import csv 


def grab_listings(page_idx): 
    ret = [] 
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/" 
      "page/{}/").format(page_idx) # the index of the page will be inserted here 
    r = requests.get(url) 
    soup = BeautifulSoup(r.text, 'html.parser') 
    l_area = soup.find("div", {"class": "wlt_search_results"}) 
    for elem in l_area.findAll("a", {"class": "frame"}): 
     # be sure to add all your results to a list and return it, 
     # if you return here then you will only get the first result 
     ret.append(elem["href"]) 
    return ret 


def main(): 
    l = [] # this will be a list of lists 
    # call the function 9 times here with idx from 1 till 9 
    for page_idx in range(1, 10): 
     l.append(grab_listings(page_idx)) 
    print l 

    with open("gyms.csv", "wb") as f: 
     writer = csv.writer(f) 
     for row in l: 
      # be sure that your row is a list here, if it is only 
      # a string all characters will be seperated by a comma. 
      writer.writerow(row) 

# for writing each URL in one line separated by commas at the end 
# with open("gyms.csv", "wb") as f: 
#  for row in l: 
#   string_to_write = ',\n'.join(row) 
#   f.write(string_to_write) 

if __name__ == '__main__': 
    main() 

我添加了一些意見的代碼並希望它足夠說明問題。如果不是隻是要求:)

+0

謝謝Salo,這些解釋特別有用!當我使用這段代碼時,雖然它創建了一個有9行的cvs,並且數據存儲爲,每頁有一行url。 這將如何修改,使每一行有一個網址? (即96行) – McLeodx

+1

不客氣@McLeodx。這更容易,因爲您不需要使用csv模塊。您可以更改代碼以提供一維列表並加入它。利用現有的代碼,可以像更新後的答案中那樣替換開放部分。 – Salo