
I've written a program in Python, but it runs much slower than I'd like. Is there any way to make my Python program run faster?

Here is the code:

from gzip import GzipFile 
from cStringIO import StringIO 
import re 
import webbrowser 
import time 
from difflib import SequenceMatcher 
import os 
import sys 
from BeautifulSoup import BeautifulSoup 
import eventlet 
import urllib 
import cookielib 
# Import the green (cooperatively yielding) urllib2 last so the plain
# "import urllib2" cannot shadow it; with the standard blocking module
# bound to the name, the GreenPool would fetch pages one at a time.
from eventlet.green import urllib2 

TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$') 
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$') 
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp') 

def download(url): 
    print "Downloading:", url 
    s = urllib2.urlopen(url).read() 
    if s[:2] == '\x1f\x8b':  # assume it's gzipped data
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url 
    return s 

def replace_chars(text, replacements): 
    return ''.join(replacements.get(x,x) for x in text) 

def handle_listing(listing_url): 
    listing_document = BeautifulSoup(download(listing_url)) 

    # ignore pages that link to yellowpages 
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ': '-', ',': '', '\'': '', '[': '', ']': ''}
        if TITLE_MATCH.match(listing_title) is not None:
            title, = TITLE_MATCH.match(listing_title).groups()
            address, = ADDRESS_MATCH.match(listing_title).groups()

            yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                replace_chars(address, reps),
                replace_chars(title, reps),
            )

            yellow_page = BeautifulSoup(download(yellow_page_url))

            page_url = yellow_page.find("h3", {"class": "business-name fn org"})
            if page_url:
                page_url = page_url.a["href"]

                business_name = title[:title.index(",")]

                page = BeautifulSoup(download(page_url))
                yellow_page_address = page.find("span", {"class": "street-address"})
                if yellow_page_address:
                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                        page_escaped = replace_chars(page_url, {':': '%3A', '/': '%2F', '?': '%3F', '=': '%3D'})

                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                            pid, page_escaped)
                        return final_url

def log_in(final_url): 
    data = urllib.urlencode({"inUserName":"[email protected]", "inUserPass":"secretword"}) 
    jar = cookielib.FileCookieJar("cookies") 
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar)) 
    opener.addheaders.append(('User-agent', 'Mozilla/4.0')) 
    opener.addheaders.append(('Referer', 'http://www.locationary.com/')) 
    opener.addheaders.append(('Cookie','site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066')) 
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data) 
    response = opener.open(request) 
    url = str(final_url) 
    anything = opener.open(url) 
    page = anything.read() 

States = [#'Alabama', 
      #'Alaska', 
      'Arizona', 
      'Arkansas', 
      'California', 
      'Colorado', 
      'Connecticut', 
      'Delaware', 
      'Florida', 
      'Georgia', 
      'Hawaii', 
      'Idaho', 
      'Illinois', 
      'Indiana', 
      'Iowa', 
      'Kansas', 
      'Kentucky', 
      'Louisiana', 
      'Maine', 
      'Maryland', 
      'Massachusetts', 
      'Michigan', 
      'Minnesota', 
      'Mississippi', 
      'Missouri', 
      'Montana', 
      'Nebraska', 
      'Nevada', 
      'New_Hampshire', 
      'New_Jersey', 
      'New_Mexico', 
      'New_York', 
      'North_Carolina', 
      'North_Dakota', 
      'Ohio', 
      'Oklahoma', 
      'Oregon', 
      'Pennsylvania', 
      'Rhode_Island', 
      'South_Carolina', 
      'South_Dakota', 
      'Tennessee', 
      'Texas', 
      'Utah', 
      'Vermont', 
      'Virginia', 
      'Washington', 
      'West_Virginia', 
      'Wisconsin', 
      'Wyoming'] 

Cities = [] 

def find_cities(state): 
    state_url = 'http://www.locationary.com/place/en/US/' + str(state) 
    state_document = download(str(state_url)) 
    findCities = re.compile('<b>(.*)</b>') 
    getCities = re.findall(findCities,state_document) 

    for City in getCities:
        reps = {' ': '_'}
        City = replace_chars(City, reps)
        Cities.append(str(City))

bestworst = ['0','1'] 

def main():
    for state in States:
        find_cities(state)
        for city in Cities:
            for num in range(0, 1):
                for pagenum in range(15, 16):
                    print '-' * 160
                    print '-' * 160
                    if str(num) == '0':
                        print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
                    else:
                        print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
                    START_URL = 'http://www.locationary.com/place/en/US/' + str(state) + '/' + city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num)
                    pool = eventlet.GreenPool()
                    listings_document = BeautifulSoup(download(START_URL))
                    listings = listings_document.findAll("a", href=LOCATION_LISTING)
                    listings = [listing['href'] for listing in listings]

                    count_listings = 0

                    for final_url in pool.imap(handle_listing, listings):
                        print final_url
                        if final_url is not None:
                            log_in(final_url)

if __name__ == '__main__': 
    main() 

Is there a way to make it faster, or is that just impossible? It has to download a lot of pages from the internet, but I'm quite sure my network connection is 10 to 50 times faster than the speed the program runs at... and my computer isn't very slow either... So, is there any way I could speed my program up by 10-50x? I know that probably sounds ridiculous, but how do professional programmers make their programs faster?

+2

This belongs on http://codereview.stackexchange.com/ – 2011-12-30 16:45:39

+0

You could use multiple threads to fetch the different pages. – 2011-12-30 16:46:14

+0

The way professional programmers make their programs faster is by profiling. Take a look at Python's cProfile module. – nmichaels 2011-12-30 16:47:14

Answers

6

The first step in speeding up any program is understanding why it is slow - that is, where is the time going? The tool programmers use to find out is called a profiler. Standard Python includes several of them; you can learn about them here.

Once you have learned to use a profiler, run it on your program to identify the hotspots: the places where the program spends the most of its time. Then try to speed the program up in one of two ways:

  1. make the hotspots take less time; or
  2. make the hotspots execute fewer times.

Usually #2 is more fruitful: choosing a better or more appropriate algorithm reduces the amount of code that has to run at all.
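For instance, one small but concrete opportunity of type #2 in the code in the question: the module-level Cities list is never cleared between states, so by the last state the inner loop in main() is revisiting every city from every earlier state as well. A sketch of one way to fix that, with find_cities returning a fresh list:

def find_cities(state):
    # Return this state's cities instead of appending to a shared global,
    # so each pass of the loop in main() only visits its own cities.
    state_document = download('http://www.locationary.com/place/en/US/' + state)
    return [city.replace(' ', '_') for city in re.findall('<b>(.*)</b>', state_document)]

# and in main():
#     for state in States:
#         for city in find_cities(state):
#             ...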

Don't waste time guessing why the program is slow; measure it, and then put your effort into fixing the real problem. Programmers are notoriously bad at guessing where the performance problems are.
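For example, a minimal sketch using the standard cProfile and pstats modules; running this in place of the bare main() call would show where the time goes (the file name scraper.prof is arbitrary):

import cProfile
import pstats

# Run main() under the profiler and write the raw timing data to a file
cProfile.run('main()', 'scraper.prof')

# Print the ten functions with the largest cumulative time - the hotspots
stats = pstats.Stats('scraper.prof')
stats.sort_stats('cumulative').print_stats(10)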

2

The way programmers optimize code is by using a profiler, and Python makes several available. Here is a great article to get you started.

A profiler can be invoked from the command line; to profile a whole script:

python -m cProfile myprogram.py 

The link above also has a bunch of examples of using timeit. Once you have figured out where your bottlenecks are, you can think about how to address them. If your program spends an inordinate amount of time in the download() function, you could consider introducing some kind of concurrency: downloading pages in the background while the program carries on parsing the already-downloaded ones with BeautifulSoup and extracting information from them.
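Since the question's code already imports eventlet and creates a GreenPool, here is a self-contained sketch of that pattern; the URLs and the pool size are just placeholders:

import eventlet
from eventlet.green import urllib2  # green sockets, so fetches can overlap

def fetch(url):
    # Each call runs in its own green thread; while one waits on the
    # network, the others make progress.
    return url, urllib2.urlopen(url).read()

urls = ['http://www.example.com/page1', 'http://www.example.com/page2']  # placeholders
pool = eventlet.GreenPool(20)  # at most 20 downloads in flight at once
for url, body in pool.imap(fetch, urls):
    print "fetched %d bytes from %s" % (len(body), url)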

The key things to look at are:

  1. where your program spends most of its time, and
  2. where it is easiest for you to optimize.

As a hypothetical example of #1: if your regular expressions were written particularly badly, they could take a very long time, and then you could work on optimizing them. I say "hypothetical" because in practice your regexes are unlikely to be a significant bottleneck unless you are executing them millions of times, or doing something similarly strange.
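If you did suspect a regex, measuring it directly is easy; for instance, a quick micro-benchmark of the TITLE_MATCH pattern from the question against a made-up title (the title string here is hypothetical):

import timeit

setup = "\n".join([
    "import re",
    "TITLE_MATCH = re.compile(r'(.*) \\(\\d{1,10}.{1,100}\\)$')",
    "title = 'Some Business (123 Main St, Anytown, United States)'",
])
# timeit runs the statement a million times by default and
# reports the total time in seconds
print timeit.timeit('TITLE_MATCH.match(title)', setup=setup)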
