
I am crawling Amazon products (amazon.com), and in principle it works fine.

I have three classes from this nice tutorial:

http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/

I added the following code to the files (class Spider):

import java.io.FileNotFoundException; 
import java.util.*; 


public class Spider { 
    public static final int MAX_PAGES_TO_SEARCH = 10000; 
    private Set<String> pagesVisited = new HashSet<String>(); 
    private List<String> pagesToVisit = new LinkedList<String>(); 

    public void search(String url) {
        while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            String currentUrl;
            SpiderLeg leg = new SpiderLeg();
            if (this.pagesToVisit.isEmpty()) {
                currentUrl = url;
                this.pagesVisited.add(url);
            } else {
                currentUrl = this.nextUrl();
            }
            try {
                // Wait 10 seconds between requests to be polite to the server.
                Thread.sleep(10000);
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                e.printStackTrace();
            }

            this.pagesToVisit.addAll(leg.getLinks());
        }
        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");

        // Print the adjacency matrix built from all crawled product pages.
        SpiderLeg leg = new SpiderLeg();
        leg.calcAdjMatrix();
        for (int i = 0; i < leg.adjMatrix.length; i++) {
            System.out.println(Arrays.toString(leg.adjMatrix[i]));
        }
    }

    private String nextUrl() {
        String nextUrl;
        do {
            if (this.pagesToVisit.isEmpty()) {
                // Fall back to a fixed start page when there is nothing left to visit.
                return "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
} 

Class SpiderLeg:

import org.jsoup.Connection; 
import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.select.Elements; 

import java.io.*; 
import java.util.*; 

public class SpiderLeg { 
// We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser. 
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"; 
    private static List<String> links = new LinkedList<String>(); 
    private static String graphLink; 
    private Document htmlDocument; 
    private static double counter = 0; 
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>(); 
    static int[][] adjMatrix; 
    static List<String> mapping; 

    public boolean crawl(String url) throws FileNotFoundException {
        if (url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code, indicating that everything is great.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            if (!connection.response().contentType().contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            // Product links contain "/dp/"; sales rank and category come from the "zg_hrsr" spans.
            Elements linksOnPage = htmlDocument.select("a[href*=/dp/]");
            Elements salesRank = htmlDocument.select("span.zg_hrsr_rank");
            Elements category = htmlDocument.select("span.zg_hrsr_ladder a");

            String categoryString = category.html().replace("\n", " ");
            String salesRankString = salesRank.html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            PrintWriter pw = new PrintWriter(new FileWriter("Horror.csv", true));
            StringBuilder sb = new StringBuilder();

            // Reuse cutTitle() so a URL without ".de/" or "/dp" cannot cause a StringIndexOutOfBoundsException.
            String title = cutTitle(url);

            if (!adjMap.containsKey(title) && categoryString.contains("Horror")) {
                adjMap.put(title, new HashSet<String>());
                sb.append(title).append(',');
                sb.append(salesRankString).append(',');
                sb.append(categoryString).append(',');
                for (Element link : linksOnPage) {
                    String graphLink = link.attr("abs:href");
                    // Skip one-click, Kindle and "unsticky" links; keep only regular product links.
                    if (!graphLink.contains("one-click") && !graphLink.contains("Kindle")
                            && !graphLink.contains("unsticky")) {
                        links.add(graphLink);
                        adjMap.get(title).add(cutTitle(graphLink));
                        sb.append(graphLink).append(',');
                    }
                }
                sb.append('\n');
                pw.write(sb.toString());
            }
            // Close the writer even when nothing was written, so file handles are not leaked.
            pw.close();

            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request.
            System.out.println("Error in our HTTP request " + ioe);
            return false;
        }
    }

    public static void calcAdjMatrix() {
        // Collect every node: the crawled titles plus all titles they link to.
        Set<String> allMyURLs = new HashSet<>(adjMap.keySet());
        for (String s : adjMap.keySet()) {
            allMyURLs.addAll(adjMap.get(s));
            System.out.println(s + "\t" + adjMap.get(s));
        }

        int dim = allMyURLs.size();
        adjMatrix = new int[dim][dim];
        List<String> nodes_list = new ArrayList<>(allMyURLs);

        // Set adjMatrix[i][j] = 1 when node i links to node j.
        for (String s : nodes_list) {
            Set<String> outEdges = adjMap.get(s);
            int i = nodes_list.indexOf(s);
            if (outEdges != null) {
                for (String s1 : outEdges) {
                    int j = nodes_list.indexOf(s1);
                    adjMatrix[i][j] = 1;
                }
            }
        }
    }

    public String cutTitle(String url) throws FileNotFoundException {
        int beginIndex = url.indexOf(".de/");
        int endIndex = url.indexOf("/dp");
        String title;
        if (url.contains(".de") && url.contains("/dp")) {
            title = url.substring(beginIndex + 4, endIndex);
        } else {
            title = "wrong url";
        }
        return title;
    }

    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks() {
        return links;
    }

}

Class SpiderTest:

public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}

The problem now is that after about 100 URLs Amazon seems to ban me from the server, and the program no longer finds any URLs.

Does anyone have an idea how I can fix this?


You might want to take a look at http://crawljax.com, a fully automated web crawler with AJAX support. – Julian


[Amazon's Conditions of Use](https://www.amazon.com/gp/help/customer/display.html/ref=footer_cou?nodeId=508088): 'This license does not include any ...; or any use of data mining, robots, or similar data gathering and extraction tools.' – greybeard


Possibly related: http://stackoverflow.com/questions/11080584/is-it-legal-to-crawl-amazon –

Answers


Well, don't be rude and crawl them, then.

Check their robots.txt (wiki) to see what they allow you to do. Don't be surprised that they ban you if you go to places they don't want you to go.
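
If you want to check the rules from the crawler itself, the rough sketch below reuses the Jsoup setup from the question. It only collects the Disallow: lines and does naive prefix matching, ignoring User-agent groups, Allow rules and wildcards, so it is an illustration rather than a full robots.txt parser.

import org.jsoup.Jsoup;

import java.util.ArrayList;
import java.util.List;

// Sketch: fetch robots.txt and do a naive prefix check against its "Disallow:" rules.
public class RobotsCheck {
    public static void main(String[] args) throws Exception {
        String robotsTxt = Jsoup.connect("https://www.amazon.de/robots.txt")
                .ignoreContentType(true)
                .execute()
                .body();

        // Collect every Disallow: rule, regardless of which User-agent group it belongs to.
        List<String> disallowed = new ArrayList<>();
        for (String line : robotsTxt.split("\n")) {
            line = line.trim();
            if (line.startsWith("Disallow:")) {
                disallowed.add(line.substring("Disallow:".length()).trim());
            }
        }

        // Example path taken from the comments below.
        String path = "/dp/product-availability/";
        boolean blocked = false;
        for (String rule : disallowed) {
            if (!rule.isEmpty() && path.startsWith(rule)) {
                blocked = true;
                break;
            }
        }
        System.out.println(path + (blocked ? " is disallowed" : " seems allowed"));
    }
}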


'Disallow: /dp/product-availability/'. How can I fix this line: Elements linksOnPage = htmlDocument.select("a[href*=/dp/]"); so that it skips 'product-availability'? Thanks for the hint. – moses
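
One way to do that, assuming Jsoup's :not() pseudo-selector behaves as expected against the page markup, is to change that select call in SpiderLeg.crawl():

// Keeps the /dp/ product links but drops the product-availability ones (sketch, untested against Amazon's current markup).
Elements linksOnPage = htmlDocument.select("a[href*=/dp/]:not([href*=product-availability])");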


@NicoHoppel how about using a proper crawler like Nutch, StormCrawler or Scrapy instead of reinventing the wheel? StormCrawler or Nutch will apply the robots.txt directives. –


That problem is very common when you try to crawl big websites that don't want to be crawled. They basically block you for some time to prevent their data from being crawled or stolen.

That said, you have two options: either send each request from a different IP/server, which makes your requests look legitimate and avoids the ban, or take the easiest route and use a service that does that for you.

I have done both. The first one is complex, takes time and needs maintenance (you have to build a network of servers); the second option is usually not free, but it is very quick to implement and guarantees that all your requests always return data and that you won't get banned.
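
A rough sketch of what the first option could look like with the Jsoup setup from the question; the proxy hosts and ports below are placeholders for servers you would have to provide yourself.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.Arrays;
import java.util.List;

// Sketch: rotate requests over a pool of HTTP proxies so they arrive from different IPs.
public class RotatingProxyFetcher {
    private static final List<String[]> PROXIES = Arrays.asList(
            new String[]{"proxy1.example.com", "8080"},
            new String[]{"proxy2.example.com", "8080"},
            new String[]{"proxy3.example.com", "8080"});
    private static int next = 0;

    public static Document fetch(String url) throws Exception {
        // Pick the next proxy in round-robin order, so consecutive requests leave from different IPs.
        String[] proxy = PROXIES.get(next++ % PROXIES.size());
        return Jsoup.connect(url)
                .proxy(proxy[0], Integer.parseInt(proxy[1])) // route this request through the chosen proxy
                .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64)")
                .get();
    }
}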

There are services on the internet that do exactly that. I have used proxycrawl in the past (they also have a free tier) and it worked very well. They have an API you can call, so you keep the same code and just change the URL you call.

This would be an example for amazon:

GET https://api.proxycrawl.com?token=yourtoken&url=https://amazon.com 

And you always get a response; even if you crawl 1000 pages per second you will never get banned, because you are calling a proxy instead, which does all the magic for you.
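
Plugged into the existing Jsoup code, that could look roughly like the sketch below; the token and parameter names are taken from the GET example above, so treat them as placeholders and check the service's documentation for the exact API.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.net.URLEncoder;

// Sketch: request the target page through the service and parse the returned HTML as before.
public class ProxyServiceFetcher {
    private static final String TOKEN = "yourtoken"; // placeholder, as in the example above

    public static Document fetch(String targetUrl) throws Exception {
        String apiUrl = "https://api.proxycrawl.com?token=" + TOKEN
                + "&url=" + URLEncoder.encode(targetUrl, "UTF-8");
        return Jsoup.connect(apiUrl)
                .ignoreContentType(true)
                .get();
    }
}

SpiderLeg.crawl() could then keep parsing the returned Document exactly as it does now.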

I hope it helps :)