
Python Wiki path search: On a personal whim I wrote some code to search for the shortest series of links between any two Wikipedia articles. It turns out that it works, as long as the path is no more than a link or two deep, but it takes a very long time to find the target. I will eventually track and make use of the link path and the page contents, but first I would like to get the search itself working as well as possible. Is there a faster way to do this, or a good way to cut some major corners here? (One possible shortcut is sketched after the listing below.)

import urllib2 
from bs4 import BeautifulSoup 
Start = 'http://en.wikipedia.org/wiki/Alan_Reid_%28politician%29' 
End = 'http://en.wikipedia.org/wiki/Ayr' 

#Using BeautifulSoup, this grabs the page 
def soup_request(target):
    #Build a request with a browser User-Agent and open that request
    #(not the bare URL), so the header is actually sent
    request = urllib2.Request(target)
    request.add_header("User-Agent", "Mozilla/5.0")
    page = urllib2.urlopen(request)
    soup = BeautifulSoup(page)
    return soup

#This will grab all Wiki links off a given page 
def get_links(Start):
    soup = soup_request(Start)
    Wiki_links = []
    #Finds all links and keeps the internal /wiki/ ones
    for url in soup.findAll('a'):
        result = url.get('href')
        if result is not None and result.startswith('/wiki'):
            Wiki_links.append(result)
    #Turn the relative paths into absolute URLs
    for q in range(len(Wiki_links)):
        Wiki_links[q] = 'http://en.wikipedia.org' + str(Wiki_links[q])
    print "Got new links from", Start
    return Wiki_links

#This will check all the given links to see if the title matches the goal webpage 
def check_links(Links, End):
    goalsoup = soup_request(End)
    goaltitle = goalsoup.html.title
    Found = False
    count = 0
    length = len(Links)
    #Runs through all the given links and checks their titles against the goal page's title
    for q in Links:
        if q is not None:
            count += 1
            soup = soup_request(q)
            print "Checked", count, "links out of", length
            try:
                title = soup.html.head.title
                if title == goaltitle:
                    Found = True
                    print "Found it!"
                    break
            except:
                print 'doh'
    return Found

#Top-level function that runs everything in order, with a maximum depth of how many levels of links to follow
def wiki_crawl(Start, End, depth):
    Old_Links = [Start]
    count = depth
    Found = False
    while count > 0:
        New_Links = []
        for q in range(len(Old_Links)):
            New_Links.extend(get_links(Old_Links[q]))
        Found = check_links(New_Links, End)
        if Found:
            print "All done."
            break
        Old_Links = New_Links
        count -= 1
        print "_______________________________________________________________ROUND DONE"
    if not Found:
        print "Did not find the page, you must go deeper!"

wiki_crawl(Start, End, 2) 
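
A possible shortcut, assuming it is acceptable to match the target by its URL rather than by its page title: keep a set of every URL already queued so no article is downloaded twice, and compare each harvested link against the End URL directly instead of fetching every candidate page just to read its title. The wiki_bfs function below is an illustrative sketch along those lines (it is not from the original post) and reuses get_links() from above.

def wiki_bfs(start, end, depth):
    #Every URL ever queued; prevents fetching the same article twice
    visited = set([start])
    frontier = [start]
    for level in range(depth):
        next_frontier = []
        for page in frontier:
            for link in get_links(page):
                #Compare URLs instead of downloading the page to read its title
                if link == end:
                    print "Found it at depth", level + 1
                    return True
                if link not in visited:
                    visited.add(link)
                    next_frontier.append(link)
        frontier = next_frontier
    print "Did not find the page, you must go deeper!"
    return False

wiki_bfs(Start, End, 2)

The trade-off is that a plain URL comparison misses redirects and alternate titles that the title check would have caught, so End needs to be the canonical /wiki/ path.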

Answer


Here are some functions for grabbing information from the wiki. The only problem is that they sometimes pull extra spaces out of the information on the page. (A shorter alternative is sketched after these functions.)

import urllib2


def take_out_parenthesis(st):
    #Strip the parenthesis characters themselves out of the string
    return ''.join(a for a in st if a not in ('(', ')'))


def take_out_tags(string):
    st = list(string)
    odd = ['<', '>']
    #Count the angle brackets; each tag contributes one '<' and one '>'
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    #Delete each tag, plus the single character just before its '<'
    #(the space inserted by take_from_web_page)
    for b in range(times):
        start = string.find('<') - 1
        end = string.find('>')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string


def take_out_brackets(string):
    st = list(string)
    odd = ['[', ']']
    #Count the square brackets; each reference marker like [1] contributes a pair
    times = 0
    for a in string:
        if a in odd:
            times += 1
    times /= 2
    #Delete each bracketed chunk, plus the single character just before its '['
    for b in range(times):
        start = string.find('[') - 1
        end = string.find(']')
        bet = end - start + 1
        for a in range(bet):
            del st[start]
        string = ''.join(st)
    return string


def take_from_web_page(text):
    n = 0
    url = text.replace(" ", "_")
    search = "http://en.wikipedia.org/wiki/%s" % url
    page = urllib2.urlopen(search).read()
    #Grab roughly the first paragraph of the article body
    start = page.find('<p><b>') + 6
    end = page.find('</a>.', start) + 5
    new_page = page[start:end]
    #Make sure every '<' is preceded by a space, so take_out_tags
    #removes that space instead of a real character
    for a in new_page:
        if a == '<':
            if new_page[n - 1] != ' ':
                lst = list(new_page)
                lst.insert(n, ' ')
                new_page = ''.join(lst)
                n += 1
        n += 1
    return take_out_parenthesis(take_out_brackets(take_out_tags(new_page)))
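
As an aside, BeautifulSoup (already used in the question's code) can strip markup by itself with get_text(), which sidesteps the stray-space problem. The first_paragraph function below is only a sketch of that idea, with a hypothetical name; the parenthesis and bracket handling above would still be needed if that text has to be removed as well.

import urllib2
from bs4 import BeautifulSoup

def first_paragraph(text):
    #Fetch the article and let BeautifulSoup parse the markup
    url = "http://en.wikipedia.org/wiki/%s" % text.replace(" ", "_")
    soup = BeautifulSoup(urllib2.urlopen(url).read())
    p = soup.find('p')
    #get_text() returns the paragraph's text with all nested tags removed
    return p.get_text() if p is not None else ''

For example, first_paragraph("Ayr") would typically return the opening sentence of the Ayr article with the tags already stripped.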