2016-01-22 68 views
8
from selenium import webdriver 
from selenium.webdriver.common.by import By 
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from bs4 import BeautifulSoup 
import urllib,requests,unidecode,lxml,pdb 
from pyvirtualdisplay import Display 
from xvfbwrapper import Xvfb 
class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that holds once more than ``count`` elements match.

    Instances are meant to be passed to ``WebDriverWait.until``: the wait
    calls the instance with the driver and stops as soon as the call
    returns a truthy value.

    Args:
        locator: ``(by, value)`` tuple, e.g. ``(By.CSS_SELECTOR, "li")``.
        count: number of matching elements that has to be exceeded.
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True when ``driver`` finds more than ``count`` matches.

        Returns False if the lookup hits a stale element, so the
        surrounding wait simply polls again.
        """
        try:
            # Use the public ``find_elements`` API instead of the private
            # ``EC._find_elements`` helper, which is an implementation
            # detail of selenium's expected_conditions module.
            elements = driver.find_elements(*self.locator)
        except StaleElementReferenceException:
            return False
        return len(elements) > self.count

def return_html_code(url, timeout=240):
    """Load ``url`` in Firefox on a virtual display and return its HTML.

    The page is scrolled to the last loaded tweet repeatedly until no
    additional tweets appear within ``timeout`` seconds; the page source at
    that point is returned.

    Args:
        url: address of the page to load.
        timeout: seconds to wait for the first tweet, and for each further
            batch of tweets after scrolling.

    Returns:
        The page source after scrolling has stopped loading new tweets.

    Raises:
        TimeoutException: if no tweet becomes visible within ``timeout``
            seconds of the initial page load.
    """
    print(url)
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.maximize_window()
            driver.get(url)

            # initial wait for the tweets to load
            wait = WebDriverWait(driver, timeout)
            wait.until(EC.visibility_of_element_located(tweet_locator))

            # scroll down to the last tweet until no more tweets are loaded
            while True:
                tweets = driver.find_elements(*tweet_locator)
                number_of_tweets = len(tweets)
                print(number_of_tweets)
                if not tweets:
                    # nothing to scroll to; avoids IndexError on tweets[-1]
                    break
                driver.execute_script(
                    "arguments[0].scrollIntoView(true);", tweets[-1])
                try:
                    # more tweets than before means another batch was loaded
                    wait.until(wait_for_more_than_n_elements_to_be_present(
                        tweet_locator, number_of_tweets))
                except TimeoutException:
                    break

            return driver.page_source
        finally:
            # always end the browser session, even when a wait times out,
            # so browser processes do not pile up across calls
            driver.quit()
    finally:
        vdisplay.stop()

輸出:無頭無端滾動硒

https://twitter.com/search?q=Error%20Check&src=typd&lang=en 
20 
39 
56 
74 

上面的代碼用於在無頭(headless)模式下對頁面進行無限滾動。但不知為何,它似乎在加載完所有推文之前就停止了。 參考 - https://stackoverflow.com/a/31058403/3646408

編輯1:

$ phantomjs --version 
2.1.1 

在 PhantomJS 下運行 @alecxe 的代碼,兩次運行顯示了不同的輸出;從最舊推文的日期可以清楚看出,還有更多的推文沒有被加載:

https://twitter.com/search?q=Error%20Check&src=typd&lang=en 
20 
40 
59 
76 
95 
114 
133 
152 
171 
191 
211 
231 
249 
267 
Date of most old tweet: 12 Jan 2016 


https://twitter.com/search?q=Error%20Check&src=typd&lang=en 
20 
40 
59 
76 
95 
114 
133 
152 
171 
191 
211 
231 
249 
267 
287 
303 
317 
337 
356 
373 
388 
400 
418 
437 
457 
476 
492 
Date of most old tweet: 8 Jan 2016 

編輯2:

運行 @alecxe 代碼的更新版本。在加載約 7000 條推文後,它顯示下面的錯誤。

Traceback (most recent call last): 
     File "twitter_script.py", line 82, in <module> 
     search_twitter('Alcoholics Anonymous') 
     File "twitter_script.py", line 76, in search_twitter 
     db_name=write_data_to_db(*get_twitter_data(query)) 
     File "twitter_script.py", line 24, in get_twitter_data 
     html_full=return_html_code(url) 
     File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code 
     html_full_source=driver.page_source 
     File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source 
     return self.execute(Command.GET_PAGE_SOURCE)['value'] 
     File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute 
     response = self.command_executor.execute(driver_command, params) 
     File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute 
     return self._request(command_info[0], url, body=data) 
     File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request 
     resp = opener.open(request, timeout=self._timeout) 
     File "c:\Anaconda\lib\urllib2.py", line 431, in open 
     response = self._open(req, data) 
     File "c:\Anaconda\lib\urllib2.py", line 449, in _open 
     '_open', req) 
     File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain 
     result = func(*args) 
     File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open 
     return self.do_open(httplib.HTTPConnection, req) 
     File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open 
     r = h.getresponse(buffering=True) 
     File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse 
     response.begin() 
     File "c:\Anaconda\lib\httplib.py", line 453, in begin 
     version, status, reason = self._read_status() 
     File "c:\Anaconda\lib\httplib.py", line 409, in _read_status 
     line = self.fp.readline(_MAXLINE + 1) 
     File "c:\Anaconda\lib\socket.py", line 480, in readline 
     data = self._sock.recv(self._rbufsize) 
    socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host 

編輯3: 對不同的 URL 嘗試相同的代碼。

https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en 
Traceback (most recent call last): 
    File "twitter_script.py", line 64, in <module> 
    search_twitter('Alcoholics Anonymous Drunk') 
    File "twitter_script.py", line 58, in search_twitter 
    db_name=write_data_to_db(*get_twitter_data(query)) 
    File "twitter_script.py", line 31, in get_twitter_data 
    html_full=return_html_code(url) 
    File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code 
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]"))) 
    File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until 
    raise TimeoutException(message, screen, stacktrace) 
selenium.common.exceptions.TimeoutException: Message: 
Screenshot: available via screen 

編輯4:

[email protected]:~/social_network_extract_proxy$ cat error.txt 
Traceback (most recent call last): 
    File "twitter_script.py", line 70, in <module> 
    search_twitter('alcoholics anonymous') 
    File "twitter_script.py", line 64, in search_twitter 
    db_name=write_data_to_db(*get_twitter_data(query)) 
    File "twitter_script.py", line 37, in get_twitter_data 
    html_full=return_html_code(url) 
    File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code 
    driver=webdriver.Firefox(firefox_profile=profile) 
    File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__ 
    self.binary, timeout), 
    File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__ 
    self.binary.launch_browser(self.profile) 
    File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser 
    self._wait_until_connectable() 
    File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable 
    % (self.profile.path)) 
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details. 

一段時間後得到了上述錯誤。

+0

您可以提供您正在使用的twitter頁面,並在每次迭代時打印出'number_of_tweets' - 在停止之前您會看到多少個加載?謝謝。 – alecxe

+0

@alecxe我現在添加了這些問題,請現在檢查。謝謝你的建議。 –

+0

@alecxe url:'https://twitter.com/search?q=Error%20Check&src=typd&lang=en',retrieved 的推文數:'74' –

回答

2

以下這些調整讓它在無頭模式下對我有效:

代碼:

import time 

def return_html_code(url, timeout=30):
    """Load ``url`` in headless PhantomJS and return its HTML.

    After the first tweets are visible, the page is scrolled to the top and
    back to the last loaded tweet several times per round; this repeats
    until no additional tweets appear within ``timeout`` seconds.

    Args:
        url: address of the page to load.
        timeout: seconds to wait for the first tweet, and for each further
            batch of tweets after scrolling.

    Returns:
        The page source after scrolling has stopped loading new tweets.

    Raises:
        TimeoutException: if no tweet becomes visible within ``timeout``
            seconds of the initial page load.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    # present a desktop browser user agent instead of PhantomJS's default
    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"

    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.maximize_window()
        driver.get(url)

        # initial wait for the tweets to load
        wait = WebDriverWait(driver, timeout)
        wait.until(EC.visibility_of_element_located(tweet_locator))

        # scroll down to the last tweet until no more tweets are loaded
        while True:
            tweets = driver.find_elements(*tweet_locator)
            number_of_tweets = len(tweets)
            print(number_of_tweets)
            if not tweets:
                # nothing to scroll to; avoids IndexError on tweets[-1]
                break

            # move to the top and then to the bottom 5 times in a row
            for _ in range(5):
                driver.execute_script("window.scrollTo(0, 0)")
                driver.execute_script(
                    "arguments[0].scrollIntoView(true);", tweets[-1])
                time.sleep(0.5)

            try:
                # more tweets than before means another batch was loaded
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    tweet_locator, number_of_tweets))
            except TimeoutException:
                break

        # hand the fully scrolled page back to the caller
        return driver.page_source
    finally:
        # always end the PhantomJS session so processes do not leak
        driver.quit()
+0

它向我顯示此錯誤:`dcap = dict(DesiredCapabilities.PHANTOMJS)` NameError: 全局名稱 'DesiredCapabilities' 未定義 –

+0

似乎不起作用。與以前相同的輸出。 –

+0

@AbhishekBhatia有趣的,適用於我,你使用'PhantomJS' 2.0.0和硒2.49? – alecxe