2016-03-27 25 views
-3

如何使用併發執行器(也就是線程池執行器 ThreadPoolExecutor)重新實現下面的代碼,或者有沒有更好的方法?基本上,我希望爬蟲先抓取給定的網址,之後也許還能繼續跟進在這些頁面上發現的其他網址,依此類推。

package Mainpackge; 

import java.io.IOException; 

import java.util.ArrayList; 
import java.util.List; 

import org.jsoup.Jsoup; 
import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element; 
import org.jsoup.select.Elements; 

/**
 * Entry point of the crawler demo: starts one {@link Worker} thread per seed URL and then
 * prints the links each worker collected.
 */
public class main {

    /**
     * Starts a worker thread for every seed URL, then, in start order, waits for each
     * worker and prints {@code absUrl("href")} of every element it returned, prefixed
     * with the worker's name. A worker that returns {@code null} is reported on stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // List of URLs to collect data from
        String[] urls = new String[]{
            "http://www.answers.com/",
            "http://www.britannica.com/",
            "https://ie.yahoo.com/?p=us",
            "https://en.wikipedia.org/wiki/Main_Page",
            // The original literal contained a stray space ("ww w."), which is not a valid host.
            "http://www.worldbook.com/",
            "http://www.computerlanguage.com/",
            "http://www.howstuffworks.com/",
            "http://www.dmoz.org/Computers/Computer_Science/"
        };

        // Create and start workers
        List<Worker> workers = new ArrayList<>(urls.length);
        for (String url : urls) {
            Worker w = new Worker(url);
            workers.add(w);
            new Thread(w).start();
        }

        // Retrieve results; waitForResults() blocks until the worker has something to return
        for (Worker w : workers) {
            Elements results = w.waitForResults();
            if (results != null) {
                for (Element result : results) {
                    // The former bare call result.absUrl("a") discarded its result and was removed.
                    System.out.println(w.getName() + ": " + result.absUrl("href"));
                }
            } else {
                System.err.println(w.getName() + " had some error!");
            }
        }
    }
}

/**
 * Fetches a single URL with jsoup, collects every {@code <a>} element of the page, and
 * lets another thread block until that work has finished.
 *
 * <p>Hand-off between the fetching thread and the waiting thread is guarded by
 * {@link #lock}: {@link #run()} publishes the outcome and {@link #waitForResults()}
 * waits for it.
 */
class Worker implements Runnable {

    private final String url;
    private Elements results;
    private final String name;
    // Plain counter: in the visible code workers are only constructed from one thread.
    private static int number = 0;

    private final Object lock = new Object();
    // Guarded by lock. True once run() has finished, whether it succeeded or failed.
    private boolean done = false;

    /**
     * Creates a worker for the given URL and assigns it the next sequential name
     * ({@code "Worker-0"}, {@code "Worker-1"}, ...).
     *
     * @param url the address to fetch when the worker runs
     */
    public Worker(String url) {
        this.url = url;
        this.name = "Worker-" + (number++);
    }

    /**
     * @return the name assigned to this worker at construction time
     */
    public String getName() {
        return name;
    }

    /**
     * Fetches the page and selects all {@code a} elements. The outcome is always
     * published and waiters are always woken, even when the fetch fails; in that case
     * the published result is {@code null}.
     */
    @Override
    public void run() {
        Elements links = null;
        try {
            Document doc = Jsoup.connect(this.url).get();
            links = doc.select("a");
        } catch (IOException e) {
            // You should implement a better error handling code..
            System.err.println("Error while parsing: " + this.url);
            e.printStackTrace();
        } finally {
            // Signal completion on every path. Previously a failed fetch never notified,
            // so waitForResults() blocked forever.
            synchronized (lock) {
                this.results = links;
                this.done = true;
                lock.notifyAll();
            }
        }
    }

    /**
     * Blocks until {@link #run()} has finished.
     *
     * @return the collected anchor elements, or {@code null} if the fetch failed or the
     *         calling thread was interrupted while waiting (the interrupt flag is
     *         restored in that case)
     */
    public Elements waitForResults() {
        synchronized (lock) {
            try {
                // Loop guards against spurious wake-ups; exit depends on completion,
                // not on results being non-null.
                while (!this.done) {
                    lock.wait();
                }
                return this.results;
            } catch (InterruptedException e) {
                // Keep the interruption visible to the caller instead of swallowing it.
                Thread.currentThread().interrupt();
                e.printStackTrace();
            }

            return null;
        }
    }
}

回答

0

爲您的線程使用ExecutorService和Callable實現的完整示例。

import java.util.ArrayList; 
import java.util.Arrays; 
import java.util.List; 
import java.util.concurrent.Callable; 
import java.util.concurrent.ExecutionException; 
import java.util.concurrent.ExecutorService; 
import java.util.concurrent.Executors; 
import java.util.concurrent.Future; 
import java.util.concurrent.TimeUnit; 

/**
 * Skeleton of a two-level crawler built on an {@link ExecutorService}: every seed URL is
 * crawled by a {@link Crawler} task, and every URL such a task reports is submitted as a
 * further task to the same pool.
 */
public class ThreadPoolExample {

    /** Number of threads in the fixed pool. */
    private static final int POOL_SIZE = 10;

    /**
     * Submits one {@link Crawler} per seed URL, waits for each of them, and submits a
     * follow-up {@link Crawler} for every URL they returned. The pool is always shut
     * down, and the method then waits up to two seconds for the follow-up tasks.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws ExecutionException if one of the seed crawl tasks threw an exception
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        List<String> urls = Arrays.asList(
            "http://www.answers.com/",
            "http://www.britannica.com/",
            "https://ie.yahoo.com/?p=us",
            "https://en.wikipedia.org/wiki/Main_Page",
            // The original literal contained a stray space ("ww w."), which is not a valid host.
            "http://www.worldbook.com/",
            "http://www.computerlanguage.com/",
            "http://www.howstuffworks.com/",
            "http://www.dmoz.org/Computers/Computer_Science/"
        );

        ExecutorService ex = Executors.newFixedThreadPool(POOL_SIZE);
        try {
            List<Future<Element>> results = new ArrayList<>();
            for (String string : urls) {
                results.add(ex.submit(new Crawler(string)));
            }

            for (Future<Element> future : results) {
                // Get will wait for the task to be done
                for (String url : future.get().urls) {
                    // Submit a new task for each URL that was found
                    ex.submit(new Crawler(url));
                }
            }
        } finally {
            // Always stop accepting work. Otherwise a failing future.get() would skip
            // shutdown and the non-daemon pool threads would keep the JVM alive.
            ex.shutdown();
        }

        // awaitTermination is only meaningful after shutdown(). Called before, as in the
        // original, it simply blocks for the full timeout.
        ex.awaitTermination(2, TimeUnit.SECONDS);
    }

    /**
     * Task that crawls a single URL. The body is a placeholder that returns fixed data.
     */
    public static class Crawler implements Callable<Element> {
        final String url;

        /**
         * @param url the address this task is meant to crawl
         */
        public Crawler(String url) {
            this.url = url;
        }

        /**
         * @return an {@link Element} holding the URLs discovered; currently two fixed
         *         placeholder strings
         * @throws Exception declared by {@link Callable#call()}; the placeholder never throws
         */
        @Override
        public Element call() throws Exception {
            // Implement your crawling logic and return your elements
            return new Element(Arrays.asList("all new urls", "that you found while crwaling"));
        }

    }

    /**
     * Result of one crawl: the list of URLs that were found.
     */
    public static class Element {
        final List<String> urls;

        /**
         * @param urls the URLs found by a crawl; stored as given, without copying
         */
        public Element(List<String> urls) {
            this.urls = urls;
        }

        /**
         * @return {@code "Elements found : "} followed by the number of URLs
         */
        @Override
        public String toString() {
            return "Elements found : " + urls.size();
        }
    }
}
+0

爲什麼在實例化執行器服務之後立即調用 awaitTermination?爲什麼線程池大小被設置爲 10?傳統上,線程池大小設置爲可用處理器的數量(`Runtime.getRuntime().availableProcessors()`)。請參閱:http://stackoverflow.com/a/1980858/363573 – Stephan