2016-09-22 40 views
3

我正在爲tshirt網站構建內容刮取器。我如何用承諾重寫?

的目標是隻通過一個硬編碼的網址,輸入網址:http://shirts4mike.com

那麼我會找到所有的T恤每一個產品頁面,然後創建一個對象與它的細節。然後將其添加到數組。

當數組中充滿了t恤,我將通過數組處理並將其記錄到CSV文件中。

現在,我在請求/響應和函數調用的時間方面遇到了一些麻煩。

如何確保在正確的時間調用NEXT函數?我明白,它不工作,因爲它是異步的性質。

如何在正確的時間調用 secondScrape、lastScraper 和 convertJson2Csv,以便它們所使用的變量不是未定義的?

我試圖使用諸如response.end()之類的東西,但這不起作用。

我假設我需要使用承諾,使其正常工作?並且清晰可辨?

任何想法?我的代碼如下:

//Modules being used: 
var cheerio = require('cheerio'); 
var request = require('request'); 
var moment = require('moment'); 

//hardcoded url 
var url = 'http://shirts4mike.com/'; 

//url for tshirt pages 
var urlSet = new Set(); 

var remainder; 
// NOTE(review): declared but never initialized — lastScraper does
// tshirtArray.push(...), which will throw unless this is set to [].
var tshirtArray; 


// Load front page of shirts4mike 
request(url, function(error, response, html) { 
    if(!error && response.statusCode == 200){ 
     var $ = cheerio.load(html); 

    //iterate over links with 'shirt' 
     $("a[href*=shirt]").each(function(){ 
      var a = $(this).attr('href'); 

      //create new link 
      var scrapeLink = url + a; 

      //for each new link, go in and find out if there is a submit button. 
      //If there, add it to the set 
      request(scrapeLink, function(error,response, html){ 
       if(!error && response.statusCode == 200) { 
        var $ = cheerio.load(html); 

        //if page has a submit it must be a product page 
        if($('[type=submit]').length !== 0){ 

         //add page to set 
         urlSet.add(scrapeLink); 

        } else if(remainder === undefined) { 
         //if not a product page, add it to remainder so another scrape can be performed. 
         remainder = scrapeLink;      
        } 
       } 
      }); 
     });  
    } 
    //call second scrape for remainder 
    // NOTE(review): this runs synchronously, BEFORE any of the nested
    // request() callbacks above have fired — so `remainder` and `urlSet`
    // are still empty here. This is the timing bug the question is about.
    secondScrape(); 
}); 


// Scrape the single non-product page recorded in `remainder`, probing each
// 'shirt' link on it for a submit button and adding product pages to urlSet.
// NOTE(review): called before `remainder` is assigned (see above), so
// request(undefined, ...) is issued on the first run.
function secondScrape() { 
    request(remainder, function(error, response, html) { 
     if(!error && response.statusCode == 200){ 
      var $ = cheerio.load(html); 

      $("a[href*=shirt]").each(function(){ 
       var a = $(this).attr('href'); 

       //create new link 
       var scrapeLink = url + a; 

       request(scrapeLink, function(error,response, html){ 
        if(!error && response.statusCode == 200){ 

         var $ = cheerio.load(html); 

         //collect remaining product pages and add to set 
         if($('[type=submit]').length !== 0){ 
          urlSet.add(scrapeLink); 
         } 
        } 
       }); 
      });  
     } 
    }); 
    // NOTE(review): both lines below execute immediately after request() is
    // ISSUED, not after it completes — urlSet is logged while still empty
    // and lastScraper() starts before any product page has been collected.
    console.log(urlSet); 
    //call lastScraper so we can grab data from the set (product pages) 
    lastScraper(); 
}; 



/**
 * Scrape every product page collected in `urlSet`, build one t-shirt record
 * per page, and call convertJson2Csv() once ALL responses have been handled.
 *
 * BUG FIXES vs. original:
 *  - `urlSet` is a Set: it has `.size`, not `.length`, and no `urlSet[i]`
 *    index access — the original for-loop never executed at all.
 *  - convertJson2Csv() was called synchronously, before any request had
 *    completed; it now fires only after the last response is processed.
 *
 * NOTE(review): `tshirtArray` is declared at the top of the file but never
 * initialized — it must be set to [] or the push below will throw.
 */
function lastScraper(){
    //scrape set, product pages
    var pending = urlSet.size;

    // Nothing collected: go straight to conversion.
    if (pending === 0) {
        convertJson2Csv();
        return;
    }

    urlSet.forEach(function(productUrl) {
        request(productUrl, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                var $ = cheerio.load(html);

                //grab data and store as variables
                var price = $('.price').text();
                var img = $('.shirt-picture').find("img").attr("src");
                var title = $('body').find(".shirt-details > h1").text().slice(4);

                //add the object into the array of tshirts
                tshirtArray.push({
                    price: price,
                    img: img,
                    title: title,
                    url: productUrl,
                    date: moment().format('MMMM Do YYYY, h:mm:ss a')
                });
            }
            //only convert once the LAST response has been handled
            if (--pending === 0) {
                convertJson2Csv();
            }
        });
    });
};
+0

一個簡單的轉換爲承諾仍然會給你一個效率低下的過程,在這個過程中頁面被重新訪問。一個好的範例,將允許每個頁面最多訪問一次。 –

回答

0

有一個叫做request-promise的npm模塊。

簡單:

var rp = require("request-promise"); 

以及在任何你正在使用 request 的地方,你都可以切換爲 request-promise。

例如:

// Request the page as a promise: the handler gets the response body,
// and any request/HTTP error lands in the catch below.
rp(url)
    .then(function(html) {
        // work with the page body here
    })
    .catch(function(err) {
        console.log(err);
    });
0

可以使用async模塊waterfall方法,它可以給你一個平穩的方式來解決這個問題。

我只是嘗試使用此模塊

希望這對你有用。

以瀑布(waterfall)的形式改寫你的代碼:

// async.waterfall runs the tasks in order; each task passes its results to
// the next one via `callback(null, ...values)`, and any error short-circuits
// straight to the final callback.
async.waterfall([ 
    function(callback) { 
    // first task: produce a value for the next task
    callback(null, previousvalue); 
    }, 
    // second task receives the value(s) from the first, plus its own callback
    function(previousvalue, callback) {} 
], function(err, result) { //Final callback 

}); 

var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');

//hardcoded url
var url = 'http://shirts4mike.com/';

//url for tshirt pages
var urlSet = new Set();

var remainder;
var tshirtArray = [];

/**
 * Fetch pageUrl, follow every 'shirt' link on it, and classify each target:
 * pages with a submit button are product pages (added to urlSet); the first
 * non-product page is remembered in `remainder` for a second pass.
 * Invokes done(err) exactly ONCE, after every probe request has finished.
 *
 * BUG FIXES vs. original:
 *  - `callback(nul, true)` referenced an undeclared `nul` (ReferenceError).
 *  - the waterfall callback was invoked once per link inside .each();
 *    async.waterfall requires exactly one invocation per task.
 */
function collectProductLinks(pageUrl, done) {
    request(pageUrl, function(error, response, html) {
        if (error || response.statusCode !== 200) {
            return done(error || new Error('statusCode was ' + response.statusCode));
        }
        var $ = cheerio.load(html);

        //create one absolute link per 'shirt' anchor
        var links = $("a[href*=shirt]").map(function() {
            return url + $(this).attr('href');
        }).get();

        var pending = links.length;
        if (pending === 0) return done(null);

        links.forEach(function(scrapeLink) {
            request(scrapeLink, function(error, response, html) {
                if (!error && response.statusCode == 200) {
                    var $ = cheerio.load(html);
                    //if page has a submit it must be a product page
                    if ($('[type=submit]').length !== 0) {
                        urlSet.add(scrapeLink);
                    } else if (remainder === undefined) {
                        //not a product page: keep it for the second scrape
                        remainder = scrapeLink;
                    }
                }
                //signal completion only after the LAST probe returns
                if (--pending === 0) done(null);
            });
        });
    });
}

async.waterfall([
    // 1) scrape the front page of shirts4mike
    function(callback) {
        collectProductLinks(url, callback);
    },
    // 2) scrape the remainder page found in step 1 (if any)
    function(callback) {
        if (remainder === undefined) return callback(null);
        collectProductLinks(remainder, callback);
    },
    // 3) scrape every collected product page
    function(callback) {
        // BUG FIX: a Set has .size, not .length, and no index access —
        // the original for-loop over urlSet never executed.
        var urls = Array.from(urlSet);
        var pending = urls.length;
        if (pending === 0) return callback(null);

        urls.forEach(function(productUrl) {
            request(productUrl, function(error, response, html) {
                if (!error && response.statusCode == 200) {
                    var $ = cheerio.load(html);

                    //grab data and add the object into the array of tshirts
                    tshirtArray.push({
                        price: $('.price').text(),
                        img: $('.shirt-picture').find("img").attr("src"),
                        title: $('body').find(".shirt-details > h1").text().slice(4),
                        url: productUrl,
                        date: moment().format('MMMM Do YYYY, h:mm:ss a')
                    });
                }
                // BUG FIX: the original step 3 never called its callback,
                // so the final waterfall handler (and convertJson2Csv)
                // never ran.
                if (--pending === 0) callback(null);
            });
        });
    }
], function(err, result) { //Final callback
    if (err) return console.error(err);
    console.log(urlSet);
    //convert the collected tshirt objects to JSON, then CSV, and log them
    convertJson2Csv();
});
0

你可以用這個例子來轉換你的代碼示例的其餘部分。

// BUG FIX: on a network error `response` is undefined, so the original
// `response.statusCode == 200 ? ... : reject(err)` threw a TypeError before
// it could reject. Check `err` first. Also declare `promise` with var
// instead of creating an implicit global.
var promise = new Promise((resolve, reject) => {
    request("http://shirts4mike.com/", (err, response, html) => {
        if (err) return reject(err);
        if (response.statusCode !== 200) {
            return reject(new Error('statusCode was ' + response.statusCode));
        }
        resolve(html);
    });
});


promise.then(html => {
    var $ = cheerio.load(html);
    // continue
});
+0

我會爲每一個請求創建一個新的承諾嗎? – bloppit

+0

是的,這將是一段路要走。 –

+0

我認爲你可以重構一個函數,該函數接受一個URL並返回一個承諾,併爲每個URL請求調用一次該函數。 –

0

您正確識別承諾是解決您的時間問題的一種方式。

爲了有承諾可用,您需要promisify request(或採用HTTP庫,其方法返回承諾)。

您可以用承諾來解決時間問題,但您也可以藉此機會改進整體範例。您可以編寫一個遞歸調用自己的函數,而不是實際上相同的第一/第二/第三階段的離散函數。正確書寫,這將確保目標網站中的每個頁面最多訪問一次;應該以整體性能爲基礎避免重新訪問,並加載目標服務器。

//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');

// Promisify `request` with multiArgs so the promise resolves with the
// [response, body] pair instead of just the response object.
// BUG FIX: without multiArgs, `.then(function(response, html))` left `html`
// permanently undefined — a .then handler only ever receives ONE value.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = Promise.promisify(require('request'), { multiArgs: true });

//hardcoded url
var url = 'http://shirts4mike.com/';

var urlSet = new Set();
var tshirtArray = [];

var maxLevels = 3; // limit the recursion to this number of levels.

/**
 * Recursively scrape url_: record it as visited, collect t-shirt data if it
 * is a product page, then follow every 'shirt' link on it (up to maxLevels
 * deep). Each page is visited at most once thanks to urlSet.
 * Always resolves — scraping errors are logged and swallowed.
 */
function scrapePage(url_, levelCounter) {
    // Bale out if :
    // a) the target url_ has been visited already,
    // b) maxLevels has been reached.
    if(urlSet.has(url_) || levelCounter >= maxLevels) {
        return Promise.resolve();
    }
    urlSet.add(url_);

    // .spread() unpacks the [response, html] pair produced by multiArgs.
    return request(url_).spread(function(response, html) {
        var $;
        if(response.statusCode !== 200) {
            throw new Error('statusCode was not 200'); // will be caught below
        }
        $ = cheerio.load(html);
        if($('[type=submit]').length > 0) {
            // yay, it's a product page.
            tshirtArray.push({
                price: $('.price').text(),
                img: $('.shirt-picture').find("img").attr("src"),
                title: $('body').find(".shirt-details > h1").text().slice(4),
                url: url_,
                date: moment().format('MMMM Do YYYY, h:mm:ss a')
            });
        }
        // Find any shirt links on the page and scrape each in turn.
        // BUG FIX: cheerio's .map() callback signature is (index, element) —
        // the original `function(link)` received the index, and DOM nodes
        // have no `.href` property anyway. The hrefs are site-relative, so
        // resolve them against the base url as the original scraper did.
        return Promise.all($("a[href*=shirt]").map(function(i, link) {
            return scrapePage(url + $(link).attr('href'), levelCounter + 1);
        }).get());
    }).catch(function(e) {
        // ensure "success" even if scraping threw an error.
        console.log(e);
        return null;
    });
}

scrapePage(url, 0).then(convertJson2Csv);

正如你所看到的,遞歸的解決方案:

  • 避免碼重複,如你所願
  • 將深入爲多個層級 - 由變量maxLevels確定。

注意:這仍然不是一個好的解決方案。這裏有一個隱含的假設,就像在原始代碼中一樣,所有的襯衫頁面都可以通過網站的主頁,通過單獨的「襯衫」鏈接到達。如果襯衫可以通過例如「服裝」>「襯衫」到達,那麼上面的代碼將找不到任何襯衫。