
I am crawling Amazon products (amazon.com), and in principle it works fine.

I have three classes from this nice tutorial:

http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/

I added the following code to the files (class Spider):

import java.io.FileNotFoundException; 
import java.util.*; 


public class Spider { 
    public static final int MAX_PAGES_TO_SEARCH = 10000; 
    private Set<String> pagesVisited = new HashSet<String>(); 
    private List<String> pagesToVisit = new LinkedList<String>(); 

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                // Wait 10 seconds between requests to be polite to the server.
                Thread.sleep(10000);
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");

        // Print the adjacency matrix built from all crawled product pages.
        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                // Fall back to a fixed start page when there is nothing left to visit.
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
} 

Class SpiderLeg:

import org.jsoup.Connection; 
import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.select.Elements; 

import java.io.*; 
import java.util.*; 

public class SpiderLeg { 
// We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser. 
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"; 
    private static List<String> links = new LinkedList<String>(); 
    private static String graphLink; 
    private Document htmlDocument; 
    private static double counter = 0; 
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>(); 
    static int[][] adjMatrix; 
    static List<String> mapping; 

    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            // Product links contain "/dp/"; sales rank and category come from the "zg_hrsr" spans.
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
            StringBuilder sb = new StringBuilder();

            // Reuse cutTitle() so a URL without ".de/" or "/dp" cannot cause a StringIndexOutOfBoundsException.
            String title = cutTitle(url);

            if (!adjMap.containsKey(title) && categoryString.contains("Horror")) {
                adjMap.put(title, new HashSet<String>());
                sb.append(title).append(',');
                sb.append(salesRankString).append(',');
                sb.append(categoryString).append(',');
                for (Element link : linksOnPage) {
                    String graphLink = link.attr("abs:href");
                    // Skip one-click, Kindle and "unsticky" links; keep only regular product links.
                    if (!graphLink.contains("one-click") && !graphLink.contains("Kindle")
                            && !graphLink.contains("unsticky")) {
                        links.add(graphLink);
                        adjMap.get(title).add(cutTitle(graphLink));
                        sb.append(graphLink).append(',');
                    }
                }
                sb.append('\n');
                pw.write(sb.toString());
            }
            // Close the writer even when nothing was written, so file handles are not leaked.
            pw.close();

            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request.
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    public static void calcAdjMatrix() {
        // Collect every node: the crawled titles plus all titles they link to.
        Set<String> allMyURLs = new HashSet<>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }

        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>(allMyURLs);

        // Set adjMatrix[i][j] = 1 when node i links to node j.
        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }

}

Class SpiderTest:

public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}

The problem now is that after about 100 URLs Amazon seems to ban me from the server, and the program no longer finds any URLs.

Does anyone have an idea how I can fix this?


You might want to take a look at http://crawljax.com, a fully automated web crawler with AJAX support. – Julian


[Amazon's Conditions of Use](https://www.amazon.com/gp/help/customer/display.html/ref=footer_cou?nodeId=508088): 'This license does not include any ...; or any use of data mining, robots, or similar data gathering and extraction tools.' – greybeard


Possibly related: http://stackoverflow.com/questions/11080584/is-it-legal-to-crawl-amazon –

Answers


Well, don't be rude and crawl them, then.

Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised that they ban you if you go to places they don't want you to go.
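
If you want to check the rules from the crawler itself, the rough sketch below reuses the Jsoup setup from the question. It only collects the Disallow: lines and does naive prefix matching, ignoring User-agent groups, Allow rules and wildcards, so it is an illustration rather than a full robots.txt parser.

import org.jsoup.Jsoup;

import java.util.ArrayList;
import java.util.List;

// Sketch: fetch robots.txt and do a naive prefix check against its "Disallow:" rules.
public class RobotsCheck {
    public static void main(String[] args) throws Exception {
        String robotsTxt = Jsoup.connect("https://www.amazon.de/robots.txt")
                .ignoreContentType(true)
                .execute()
                .body();

        // Collect every Disallow: rule, regardless of which User-agent group it belongs to.
        List<String> disallowed = new ArrayList<>();
        for (String line : robotsTxt.split("\n")) {
            line = line.trim();
            if (line.startsWith("Disallow:")) {
                disallowed.add(line.substring("Disallow:".length()).trim());
            }
        }

        // Example path taken from the comments below.
        String path = "/dp/product-availability/";
        boolean blocked = false;
        for (String rule : disallowed) {
            if (!rule.isEmpty() && path.startsWith(rule)) {
                blocked = true;
                break;
            }
        }
        System.out.println(path + (blocked ? " is disallowed" : " seems allowed"));
    }
}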


'Disallow: /dp/product-availability/'. How can I fix this line: Elements linksOnPage = htmlDocument.select("a[href*=/dp/]"); so that it skips 'product-availability'? Thanks for the hint. – moses
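
One way to do that, assuming Jsoup's :not() pseudo-selector behaves as expected against the page markup, is to change that select call in SpiderLeg.crawl():

// Keeps the /dp/ product links but drops the product-availability ones (sketch, untested against Amazon's current markup).
Elements linksOnPage = htmlDocument.select("a[href*=/dp/]:not([href*=product-availability])");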


@NicoHoppel how about using a proper crawler like Nutch, StormCrawler or Scrapy instead of reinventing the wheel? StormCrawler or Nutch will apply the robots.txt directives. –


That problem is very common when you try to crawl big websites that don't want to be crawled. They basically block you for some time to prevent their data from being crawled or stolen.

That said, you have two options: either send each request from a different IP/server, which makes your requests look legitimate and avoids the ban, or take the easiest route and use a service that does that for you.

I have done both. The first one is complex, takes time and needs maintenance (you have to build a network of servers); the second option is usually not free, but it is very quick to implement and guarantees that all your requests always return data and that you won't get banned.
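
A rough sketch of what the first option could look like with the Jsoup setup from the question; the proxy hosts and ports below are placeholders for servers you would have to provide yourself.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Arrays;
import java.util.List;

// Sketch: rotate requests over a pool of HTTP proxies so they arrive from different IPs.
public class RotatingProxyFetcher {
    private static final List<String[]> PROXIES = Arrays.asList(
            new String[]{"proxy1.example.com", "8080"},
            new String[]{"proxy2.example.com", "8080"},
            new String[]{"proxy3.example.com", "8080"});
    private static int next = 0;

    public static Document fetch(String url) throws Exception {
        // Pick the next proxy in round-robin order, so consecutive requests leave from different IPs.
        String[] proxy = PROXIES.get(next++ % PROXIES.size());
        return Jsoup.connect(url)
                .proxy(proxy[0], Integer.parseInt(proxy[1])) // route this request through the chosen proxy
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64)")
                .get();
    }
}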

There are services on the internet that do exactly that. I have used proxycrawl in the past (they also have a free tier) and it worked very well. They have an API you can call, so you keep the same code and just change the URL you call.

This would be an example for amazon:

GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com 

And you always get a response; even if you crawl 1000 pages per second you will never get banned, because you are calling a proxy instead, which does all the magic for you.
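
Plugged into the existing Jsoup code, that could look roughly like the sketch below; the token and parameter names are taken from the GET example above, so treat them as placeholders and check the service's documentation for the exact API.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URLEncoder;

// Sketch: request the target page through the service and parse the returned HTML as before.
public class ProxyServiceFetcher {
    private static final String TOKEN = "yourtoken"; // placeholder, as in the example above

    public static Document fetch(String targetUrl) throws Exception {
        String apiUrl = "https://api.proxycrawl.com?token=" + TOKEN
                + "&url=" + URLEncoder.encode(targetUrl, "UTF-8");
        return Jsoup.connect(apiUrl)
                .ignoreContentType(true)
                .get();
    }
}

SpiderLeg.crawl() could then keep parsing the returned Document exactly as it does now.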

I hope it helps :)