links_list = char.getLinks(words)
for source_url in links_list:
try:
print 'Downloading URL: ' + source_url
urldict = hash_url(source_url)
source_url_short = urldict['url_short']
source_url_hash = urldict['url_short_hash']
if Url.objects.filter(source_url_short = source_url_short).count() == 0:
try:
htmlSource = getSource(source_url)
except:
htmlSource = '-'
print '\thtmlSource got an error...'
new_u = Url(source_url = source_url, source_url_short = source_url_short, source_url_hash = source_url_hash, html = htmlSource)
new_u.save()
time.sleep(3)
else:
print '\tAlready in database'
except:
print '\tError with downloading URL..'
time.sleep(3)
pass
def getSource(theurl, unicode = 1, moved = 0):
    """Download `theurl` and return the body as a UTF-8 byte string.

    Parameters:
        theurl  -- URL to fetch.
        unicode -- kept for backward compatibility with existing callers;
                   currently unused.
        moved   -- when 1, resolve redirects first so the final target is
                   requested with our spoofed User-Agent header.

    Raises whatever urllib2 raises on network/HTTP failure; the caller is
    expected to handle that.
    """
    if moved == 1:
        # Resolve the redirect chain, then close the probe response --
        # leaving it open leaks a file descriptor per call.
        probe = urllib2.urlopen(theurl)
        theurl = probe.geturl()
        probe.close()
    urlReq = urllib2.Request(theurl)
    urlReq.add_header('User-Agent', random.choice(agents))
    urlResponse = urllib2.urlopen(urlReq)
    try:
        htmlSource = urlResponse.read()
    finally:
        # Always close the response; unclosed responses accumulate open
        # sockets/FDs over a long crawl.
        urlResponse.close()
    # Round-trip through unicode to validate the payload as UTF-8.
    # Use 'replace' so a page with a few bad bytes yields usable text
    # instead of raising UnicodeDecodeError and losing the whole page.
    return htmlSource.decode('utf-8', 'replace').encode('utf-8')
基本上這個代碼的作用是...它需要一個URL列表並下載它們,並將它們保存到數據庫中。就這樣。我的代碼是否泄漏內存(python)?
有什麼原因讓你認爲你的代碼泄漏了內存嗎? – Jehiah 2009-11-28 03:09:42
發生任何錯誤?或花費太多時間?雖然'htmlSource.decode('utf-8')。encode('utf-8')'這個技術很奇怪,它的解碼來自utf8並且同時編碼回utf8。 – YOU 2009-11-28 03:10:45
沒有錯誤發生。但是,我的腳本隨機被「殺死」。之前有人建議這是內存泄漏,導致我的內存過載。 – TIMEX 2009-11-28 03:12:14