如何在服務器端的JavaScript（Node.js）中爲for循環的每次迭代添加延遲？

我正在嘗試使用Node.js從電子商務網站抓取數據。我使用Request獲取頁面的HTML，並使用Cheerio在服務器端執行DOM選擇。如何在服務器端的JavaScript中爲for循環的每次迭代添加延遲？

const cheerio = require('cheerio'); 
const request = require('request'); 

/**
 * Fetches a page with `request` and extracts the price text from it.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<{url: string, price: string}>} Resolves with the scraped
 *   data; rejects with the error reported by `request`.
 */
const scrapePage = (url) => {
    return new Promise((resolve, reject) => {
        request(url, (error, resp, body) => {
            if (error) {
                reject(error);
                // Stop here: on a failed request there is no body to parse,
                // and the promise is already settled.
                return;
            }

            const $ = cheerio.load(body);
            const price = $('#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description').text();

            resolve({ url, price });
        });
    });
};

// Runs scrapePage in a loop 
// There is a variable called arrayOfURLs defined elsewhere that contains 100s of URLs 

// Starts one scrape per URL. The loop does not wait for a request to finish
// before starting the next one, so all requests are issued back-to-back.
for (const url of arrayOfURLs) {
    scrapePage(url)
        .then((obj) => {
            //write to a file
        })
        .catch((error) => {
            // Report the failure instead of silently discarding it, so that
            // failed or blank responses can be diagnosed.
            console.error(`Failed to scrape ${url}:`, error);
        });
}

問題是，我發送請求的服務器有時會返回空白數據，我猜測是因爲我在沒有任何停頓的情況下發送了太多請求。由於JS的異步特性，我很難弄清楚如何在循環的每次迭代之間添加有效的延遲。僅僅以同步的寫法加上setTimeout是不夠的，因爲setTimeout本身是異步的；而且我是在服務器上運行它，所以沒有Window對象。

編輯

上面的代碼是我實際所用代碼的簡化版本。完整的代碼如下：

app.js

// Node's built-in file-system module.
const fs = require('fs'); 
// File the URL list is read from.
const path = 'urls.txt'; 
// File the scraped results are appended to.
const path2 = 'results.txt'; 
// Local module that performs the actual page scraping.
const scraper = require('./scraper'); 

/**
 * Scrapes one URL and appends the result, as JSON followed by ", ", to the
 * results file (`path2`).
 *
 * The promise is returned so callers can wait for completion, e.g. with
 * `Promise.all(urlArray.map(scrapePage))`. Scrape and write failures are
 * logged rather than propagated, so the returned promise always resolves.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Settles once the result has been written, or once
 *   the failure has been logged.
 */
const scrapePage = (url) => {
    return scraper.scrapePage(url)
        .then((obj) => new Promise((resolve) => {
            // Adapt the callback API so the chain waits for the write.
            fs.appendFile(path2, `${JSON.stringify(obj)}, `, (error) => {
                if (error) {
                    console.log(error);
                }
                resolve();
            });
        }))
        .catch((error) => {
            console.log('There was an error scraping obj: ');
            console.log(error);
        });
};

// Read the URL list from `path` and parse it.
fs.readFile(path, 'utf8', (err, data) => {
    // A read failure is rethrown from the callback; nothing here catches it.
    if (err) throw err;

    // Presumably a JSON array of URL strings (the attempts below call .map
    // on it) - verify against the contents of the file.
    const urlArray = JSON.parse(data);

    // Attempt 1 - fails with "Unexpected Identifier": `await` is only valid
    // inside an `async` function, and this callback is not one.
    // const results = await Promise.all(urlArray.map(scrapePage));

    // Attempt 2 - fails with "Unexpected Token Function".
    // NOTE(review): likely the Node.js version in use predates `async`
    // function support - confirm.
    // async function scrapePages(){
    // const results = await Promise.all(urlArray.map(scrapePage));
    // };
});

scraper.js

const request = require('request'); 
const cheerio = require('cheerio'); 

exports.scrapePage = (url) => { 
    return new Promise((resolve, reject) => { 
     request(url, (error, resp, body) => { 
      if(error){ 
       reject(error); 
      }; 

      let $ = cheerio.load(body); 
      let $url = url; 

      let $price = $('#rt-mainbody > div > div.details > div.itemData > div:nth-child(4) > div.description').text(); 

      let obj = { 
       url: $url, 
       price: $price 
      } 

      resolve(obj); 

     }) 
    }) 
}

來源

2017-12-18 fuzzybabybunny

可能重複：[sleep()的JavaScript版本是什麼？](https://stackoverflow.com/questions/951021/what-is-the-javascript-version-of-sleep) –

你不能假設數據在一定時間之後就一定已經到位。請改用基於回調函數的做法。 – Ctznkane525

我會先調查這個空白數據問題，至少記錄響應頭和響應狀態碼，以便找出錯誤出在哪裏。你似乎只是在猜測——爲什麼要在弄清楚錯誤是什麼之前就動手修改呢？ –

在我看來，你沒有等待所有Promise解決就發送了服務器響應。你可以使用async/await來完全消除for循環，例如：

// Start every scrape, then wait until all of them have settled successfully.
// `await` must run inside an `async` function (or at ES-module top level).
const pendingScrapes = arrayOfURLs.map(scrapePage);
const results = await Promise.all(pendingScrapes);

來源

2017-12-18 01:52:44 James

我對代碼做了一點小修改。我已經在用'.then'等待Promise解決了：'scrapePage'方法返回一個Promise，我用'.then'等待它解決，之後再將結果寫入文件。 – fuzzybabybunny

@fuzzybabybunny是的，這對於每次抓取之後的後處理來說沒有問題，但我擔心的是你沒有在返回HTTP響應之前等待*所有*Promise解決——這是服務器返回「空白」回覆的最可能原因，因爲它在抓取完成*之前*就返回了。 – James

我編輯了原帖以包含我的實際代碼。我加入了'await Promise.all'那段代碼，但是收到了錯誤——這些錯誤已在更新後的代碼註釋中說明。我是在一個從文本文件讀取URL的方法的回調函數中運行'scrapePage'函數的。 – fuzzybabybunny

如果您希望不超過x個活動連接數，您可以使用throttle。或者，如果您希望每秒不超過x數量，則可以使用throttlePeriod。

只要有一個請求失敗，Promise.all就不會調用你的resolve處理程序，所以你可以捕獲每個錯誤並返回一個Fail對象：

// Marker type for a failed scrape; `details` holds [error, url].
class Fail {
    constructor(details) {
        this.details = details;
    }
}

// Declared explicitly: assigning to undeclared names creates implicit
// globals and throws a ReferenceError in strict mode / ES modules.
// Both stay undefined until the Promise.all below has resolved.
let successes;
let failed;

const max10 = throttle(10)(scrapePage);//max 10 active connections 
//const fivePerSecond = throttlePeriod(2,1000)(scrapePage); //start no more than 2 per second 
Promise.all(
    arrayOfURLs.map(
        // Turn each rejection into a Fail value so that a single failing
        // request cannot reject the whole Promise.all.
        (url) => max10(url).catch((err) => new Fail([err, url]))
    )
).then((results) => {
    successes = results.filter((result) => !(result instanceof Fail));
    failed = results.filter((result) => result instanceof Fail);
});

來源

2017-12-18 03:41:31 HMR

const cheerio = require('cheerio'); 
const request = require('request'); 
/**
 * Fetches a page with `request` and extracts the price text from it.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<{url: string, price: string}>} Resolves with the scraped
 *   data. Rejects with the error reported by `request`, or with the string
 *   'Empty Body' when the response body is falsy.
 */
let scrapePage = (url) => {
    const priceSelector = '#rt-mainbody > div > div.details > div.Data > div:nth-child(4) > div.description';

    const executor = (resolve, reject) => {
        const onResponse = (error, resp, body) => {
            // Request-level failure: pass it straight through.
            if (error) {
                return void reject(error);
            }

            // A blank page is treated as a failure as well.
            if (!body) {
                return void reject('Empty Body');
            }

            const page = cheerio.load(body);
            resolve({ url: url, price: page(priceSelector).text() });
        };

        request(url, onResponse);
    };

    return new Promise(executor);
};

/**
 * Scrapes `url`, then continues with the next URL popped from the end of
 * `arrayOfURLs`, so only one request is in flight at a time.
 *
 * A URL whose scrape fails is put back at the front of the queue and retried
 * later. NOTE: a URL that fails permanently is retried forever.
 *
 * @param {string} [url] - URL to scrape; `undefined` (e.g. from `pop()` on an
 *   empty queue) ends the chain immediately.
 * @param {number} [delayMs=0] - Pause in milliseconds before each subsequent
 *   request.
 * @returns {Promise<void>} Resolves once the queue has been drained.
 */
function processUrl(url, delayMs = 0) {
    if (url === undefined) {
        return Promise.resolve();
    }

    const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    return scrapePage(url)
        .then((obj) => {
            //write to a file
        })
        .catch((error) => {
            console.error(`Scrape failed for ${url}, re-queueing:`, error);
            arrayOfURLs.unshift(url);
        })
        .then(() => {
            // Continue only while the queue still holds URLs.
            if (arrayOfURLs.length === 0) {
                return undefined;
            }
            return wait(delayMs).then(() => processUrl(arrayOfURLs.pop(), delayMs));
        });
}
// Start processing with the URL taken from the end of the queue.
processUrl(arrayOfURLs.pop());

這裏我們把arrayOfURLs數組當作隊列使用：如果收到錯誤或空白頁面，就把該URL重新放回數組中。這樣我們就可以依次（一次一個）處理每個URL。

來源

2017-12-18 06:07:04

這看起來是一個很好的ES6+解決方案。我會考慮使用'Array#pop'來代替索引。 – pguardiario

如何在服務器端的JavaScript（Node.js）中爲for循環的每次迭代添加延遲？

回答

相關問題