我已經用python編寫了一個計算機程序,但運行速度比我想要的要慢很多。有沒有辦法讓我的Python程序運行得更快?
下面是代碼:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib
import urllib2
import cookielib
# Extracts the business name from a listing <title> of the form
# "Name (something)" -- the parenthesised tail is discarded.
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
# Extracts the street-address portion from a title ending in
# "(<address>, <4-14 chars>, United States)" -- the middle field is
# presumably a region/ZIP segment (TODO confirm against live titles).
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
# Matches individual place-listing URLs on locationary.com city pages;
# used below to collect listing links from a city index page.
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b': # assume it's gzipped data
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
    """Return *text* with every character swapped for its entry in the
    *replacements* mapping; characters without an entry pass through
    unchanged (a replacement may be the empty string, i.e. a deletion).
    """
    pieces = []
    for ch in text:
        pieces.append(replacements.get(ch, ch))
    return ''.join(pieces)
def handle_listing(listing_url):
    """Process one Locationary listing page.

    Looks the business up on yellowpages.com and, when the street address
    found there is a close-enough fuzzy match to the listing's address,
    builds and returns the Locationary proxy URL that links the two
    records.  Falls through (returns None) whenever any step fails to
    match -- the caller is expected to skip None results.
    """
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        # character substitutions used to slugify the name/address into a
        # yellowpages.com URL path
        reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
        if TITLE_MATCH.match(listing_title) is not None:
            title, = TITLE_MATCH.match(listing_title).groups()
            address, = ADDRESS_MATCH.match(listing_title).groups()
            yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                replace_chars(address, reps),
                replace_chars(title, reps),
            )
            yellow_page = BeautifulSoup(download(yellow_page_url))
            # first search result heading on the yellowpages results page
            page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
            if page_url:
                page_url = page_url.a["href"]
                # NOTE(review): business_name is computed but never used
                business_name = title[:title.index(",")]
                page = BeautifulSoup(download(page_url))
                yellow_page_address = page.find("span", {"class" : "street-address"})
                if yellow_page_address:
                    # fuzzy address comparison; 0.5 is an empirical threshold
                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        # place id embedded in the listing URL, e.g. "...p12345.jsp"
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                        # minimal percent-encoding of the target URL for the query string
                        page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                            pid, page_escaped)
                        return final_url
def log_in(final_url):
    """Log in to locationary.com, then fetch *final_url* with the
    authenticated (cookie-carrying) opener.

    NOTE(review): account credentials are hard-coded below -- they should
    be moved to a config file or environment variables, not source control.
    """
    data = urllib.urlencode({"inUserName":"[email protected]", "inUserPass":"secretword"})
    jar = cookielib.FileCookieJar("cookies")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.addheaders.append(('User-agent', 'Mozilla/4.0'))
    opener.addheaders.append(('Referer', 'http://www.locationary.com/'))
    # Fix: the original appended a *second* Cookie header whose value began
    # with the literal text "Cookie: " -- a malformed duplicate.  A single
    # well-formed Cookie header is sent instead.
    opener.addheaders.append(('Cookie','site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066'))
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data)
    response = opener.open(request)
    url = str(final_url)
    anything = opener.open(url)
    page = anything.read()
# US states to crawl, underscore-slugged to match locationary.com URL paths.
# Alabama and Alaska are commented out -- presumably already processed in an
# earlier run (TODO confirm before re-enabling).
States = [#'Alabama',
#'Alaska',
'Arizona',
'Arkansas',
'California',
'Colorado',
'Connecticut',
'Delaware',
'Florida',
'Georgia',
'Hawaii',
'Idaho',
'Illinois',
'Indiana',
'Iowa',
'Kansas',
'Kentucky',
'Louisiana',
'Maine',
'Maryland',
'Massachusetts',
'Michigan',
'Minnesota',
'Mississippi',
'Missouri',
'Montana',
'Nebraska',
'Nevada',
'New_Hampshire',
'New_Jersey',
'New_Mexico',
'New_York',
'North_Carolina',
'North_Dakota',
'Ohio',
'Oklahoma',
'Oregon',
'Pennsylvania',
'Rhode_Island',
'South_Carolina',
'South_Dakota',
'Tennessee',
'Texas',
'Utah',
'Vermont',
'Virginia',
'Washington',
'West_Virginia',
'Wisconsin',
'Wyoming']
# Global accumulator: every call to find_cities() appends into this list.
Cities = []

def find_cities(state):
    """Scrape the city names for *state* from locationary.com.

    Appends each underscore-slugged city name to the global ``Cities``
    list (as before), and additionally returns just the cities found for
    this one state so callers no longer have to diff the global list.
    """
    state_url = 'http://www.locationary.com/place/en/US/' + str(state)
    state_document = download(str(state_url))
    # city names appear inside <b>...</b> on the state index page
    found = re.findall('<b>(.*)</b>', state_document)
    new_cities = [str(city.replace(' ', '_')) for city in found]
    Cities.extend(new_cities)
    return new_cities
# NOTE(review): this list is never referenced anywhere in the file -- the
# "best"/"worst" order values are produced by range(0, 1) in main() instead.
bestworst = ['0','1']
def main():
for state in States:
find_cities(state)
for city in Cities:
for num in range(0,1):
for pagenum in range(15,16):
print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
if str(num) == '0':
print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
else:
print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
START_URL = 'http://www.locationary.com/place/en/US/' + str(state) + '/' + city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num)
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
count_listings = 0
for final_url in pool.imap(handle_listing, listings):
print final_url
if final_url is not None:
log_in(final_url)
if __name__ == '__main__':
main()
有沒有一種方法,使其更快或者是不可能的?它必須從互聯網下載很多網址,但我非常確定我的網絡連接速度比現在快10到50倍......而且我的電腦速度不是很慢......所以,有什麼辦法可以讓我的程序速度提高10-50倍?我知道這可能聽起來很荒謬,但專業程序員如何讓他們的程序更快呢?
這個問題更適合發佈在 http://codereview.stackexchange.com/ – 2011-12-30 16:45:39
您可以使用多個線程來獲取不同的頁面。 – 2011-12-30 16:46:14
專業程序員使他們的程序更快的方式是通過分析。看看Python的cProfile模塊。 – nmichaels 2011-12-30 16:47:14