2012-07-05 102 views
124

我使用 PhantomJS v1.4.1 加載一些網頁。我無法訪問這些網頁的服務器端,只拿到了指向它們的鏈接。我使用的是舊版 Phantom,因爲需要在這些網頁上支持 Adobe Flash。(標題:PhantomJS 不會等待頁面「完全」加載)

問題是許多網站以異步方式加載次要內容,因此 Phantom 的 onLoadFinished 回調(類似於 HTML 中的 onLoad)會在內容尚未全部加載時過早觸發。有人能建議我如何等待網頁完全加載嗎?例如,爲了截取包含所有動態內容(如廣告)的屏幕截圖。

回答

12

也許你可以使用 onResourceRequested 和 onResourceReceived 回調來檢測異步加載。下面是官方文檔中使用這些回調的例子:

// Print every resource request and response reported by the page as
// 4-space-indented JSON.
var page = require('webpage').create();

// Build a resource callback that logs its payload behind a fixed prefix.
function makeResourceLogger(prefix) {
    return function (payload) {
        console.log(prefix + JSON.stringify(payload, undefined, 4));
    };
}

page.onResourceRequested = makeResourceLogger('Request ');
page.onResourceReceived = makeResourceLogger('Receive ');

// NOTE(review): `url` is not defined in this snippet; the surrounding script must supply it.
page.open(url);

此外,你可以看看examples/netsniff.js的工作示例。

+0

但在這種情況下,我不能使用PhantomJS的一個實例一次加載多個頁面,對嗎? – nilfalse

+0

onResourceRequested是否適用於AJAX /跨域請求?還是它只適用於像CSS,圖像..等? – CMCDragonkai

+0

@CMCDragonkai我從來沒有使用過它,但基於[this](https://github.com/ariya/phantomjs/wiki/Network-Monitoring),它似乎包含所有請求。 Quote:'所有的資源請求和響應可以使用onResourceRequested和onResourceReceived嗅探' – Supr

18

你可以嘗試把 waitfor 和 rasterize 這兩個示例結合起來:

/**
 * See https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
 *
 * Poll a test condition every 250ms until it becomes truthy or a timeout
 * elapses. Useful for waiting on a server response or for a UI change
 * (fadeIn, etc.) to occur.
 *
 * The condition is evaluated once immediately and then on each poll. Polling
 * is stopped BEFORE the outcome is handled, so `onReady` runs at most once
 * even if it throws.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')")
 * or as a callback function.
 * @param onReady what to do when the testFx condition is fulfilled,
 * it can be passed in as a string of code or as a callback function.
 * @param timeOutMillis the max amount of time to wait. If not specified
 * (or falsy, e.g. 0), 3 sec is used.
 *
 * On timeout, logs "'waitFor()' timeout" and calls phantom.exit(1).
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default max timeout is 3s
        start = new Date().getTime(),
        // NOTE(review): string arguments go through eval(); only pass trusted code.
        condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()), //< defensive code
        interval = setInterval(function() {
            if ((new Date().getTime() - start < maxtimeOutMillis) && !condition) {
                // Not timed out yet and condition not yet fulfilled: re-evaluate it.
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
                return;
            }

            // Either outcome is final. Stop polling first so that a throwing
            // onReady (or a phantom.exit that does not stop the script
            // immediately) cannot cause this handler to fire again.
            clearInterval(interval);

            if (!condition) {
                // Timed out and the condition is still false.
                console.log("'waitFor()' timeout");
                phantom.exit(1);
            } else {
                // Condition fulfilled.
                console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condition is fulfilled
            }
        }, 250); //< repeat check every 250ms
};

// Rasterize a URL to a file once every tracked resource has finished loading.
// Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]
var page = require('webpage').create(), system = require('system'), address, output, size;

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log(' paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    phantom.exit(1);
} else {
    address = system.args[1];
    output = system.args[2];
    // Paper size only applies when the output filename ends in ".pdf".
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        size = system.args[3].split('*');
        // "W*H" gives explicit dimensions; anything else is treated as a named format.
        page.paperSize = size.length === 2 ? {
            width : size[0],
            height : size[1],
            margin : '0px'
        } : {
            format : system.args[3],
            orientation : 'portrait',
            margin : {
                left : "5mm",
                top : "8mm",
                right : "5mm",
                bottom : "9mm"
            }
        };
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    // Last seen stage of each resource, indexed by resource id.
    var resources = [];
    page.onResourceRequested = function(request) {
        // NOTE(review): request objects may not carry a `stage`; if so the slot
        // stays undefined (i.e. "not finished") until a response arrives.
        resources[request.id] = request.stage;
    };
    page.onResourceReceived = function(response) {
        resources[response.id] = response.stage;
    };
    page.open(address, function(status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            // Exit non-zero so calling scripts can detect the failed load.
            phantom.exit(1);
        } else {
            waitFor(function() {
                // Ready only when every tracked resource has reached the 'end' stage.
                // Index 0 is skipped (presumably resource ids start at 1 - confirm).
                for (var i = 1; i < resources.length; ++i) {
                    if (resources[i] != 'end') {
                        return false;
                    }
                }
                return true;
            }, function() {
                page.render(output);
                phantom.exit();
            }, 10000);
        }
    });
}
+3

這似乎不適用於使用任何服務器推送技術的網頁,因爲在 onLoad 觸發之後資源仍然在被使用。 – nilfalse

+0

有沒有任何驅動程序(例如 [poltergeist](https://github.com/jonleighton/poltergeist))具備這樣的功能? –

+0

是否可以使用waitFor輪詢整個HTML文本並搜索定義的關鍵字?我試圖實現這一點,但似乎輪詢不會刷新到最新下載的HTML源代碼。 – fpdragon

66

另一種方法是讓 PhantomJS 在頁面加載之後、渲染之前稍等片刻。做法與常規的 rasterize.js 示例相同,只是使用更長的超時時間,讓 JavaScript 有時間完成額外資源的加載:

// Open the page, then wait a fixed delay before rendering so that scripts on
// the page get time to load additional resources.
// NOTE(review): `page`, `address` and `output` come from the surrounding script.
page.open(address, function (status) {
    if (status !== 'success') {
        console.log('Unable to load the address!');
        // Exit non-zero so calling scripts can detect the failed load.
        phantom.exit(1);
    } else {
        window.setTimeout(function () {
            page.render(output);
            phantom.exit();
        }, 1000); // Change timeout as required to allow sufficient time
    }
});
+1

是的,目前我堅持這種方法。 – nilfalse

+7

您應該接受回覆 – alex88

+89

這是一個可怕的解決方案,對不起(這是PhantomJS的錯!)。如果等待一秒鐘,但需要20ms加載,這完全浪費時間(想想批處理作業),或者如果花費時間超過一秒鐘,它仍然會失敗。專業工作無法忍受這種無效率和不可靠性。 – CoDEmanX

13

在我的程序中,我用一些邏輯來判斷頁面是否已加載完成:監視它的網絡請求,如果在過去 200 毫秒內沒有新的請求,我就視爲它已加載完成。

在onLoadFinish()之後使用這個。

/**
 * Invoke `callback` once the page's network activity has gone quiet.
 *
 * Installs onResourceRequested / onResourceReceived / onResourceError
 * handlers on `page` (replacing any that were already set) and tracks the ids
 * of outstanding requests. A 200ms timer is re-armed each time a tracked
 * response finishes; when it fires with nothing outstanding (or after 3
 * retries with requests still pending) the callback is invoked.
 *
 * @param page     object that receives the resource handlers
 *                 (presumably a PhantomJS webpage - confirm against caller).
 * @param callback called as callback(null, page), at most once. Exceptions it
 *                 throws are logged rather than propagated.
 */
function onLoadComplete(page, callback){
    var waiting = [];        // ids of requests that have not finished yet
    var interval = 200;      // ms of quiet time to wait for a new request
    var max_retry = 3;       // max timer re-arms while requests are still pending
    var counter_retry = 0;
    var done = false;        // set once the callback has been invoked
    var tlogger = {};        // for debug: response start time by resource id
    var timer = setTimeout(timeout, interval);

    function timeout(){
        if(done){
            return;
        }
        if(waiting.length && counter_retry < max_retry){
            timer = setTimeout(timeout, interval);
            counter_retry++;
            return;
        }
        // Mark completion before calling out so late network events cannot
        // re-arm the timer and trigger a second callback.
        done = true;
        try{
            callback(null, page);
        }catch(e){
            // Keep the best-effort behaviour, but do not hide the failure.
            console.log('[onLoadComplete] callback threw: ', e);
        }
    }

    bindEvent(page, 'request', function(req){
        if(done) return;
        waiting.push(req.id);
    });

    bindEvent(page, 'receive', function (res) {
        if(done) return;
        var cT = res.contentType;
        if(!cT){
            console.log('[contentType] ', cT, ' [url] ', res.url);
            return remove(res.id);
        }
        // Only responses whose content type starts with "application" or
        // "text" are waited for; everything else is dropped immediately.
        if(cT.indexOf('application') !== 0 && cT.indexOf('text') !== 0) return remove(res.id);

        if (res.stage === 'start') {
            console.log('!!received start: ', res.id);
            tlogger[res.id] = new Date();
        }else if (res.stage === 'end') {
            console.log('!!received end: ', res.id, (new Date() - tlogger[res.id]));
            delete tlogger[res.id];
            remove(res.id);

            // A tracked resource just finished: restart the quiet period.
            clearTimeout(timer);
            timer = setTimeout(timeout, interval);
        }
    });

    bindEvent(page, 'error', function(err){
        if(done) return;
        remove(err.id);
        if(waiting.length === 0){
            counter_retry = 0;
        }
    });

    // Drop a request id from the outstanding list, if present.
    function remove(id){
        var i = waiting.indexOf(id);
        if(i >= 0){
            waiting.splice(i,1);
        }
    }

    // Map a short event name onto the corresponding page handler property.
    function bindEvent(page, evt, cb){
        switch(evt){
            case 'request':
                page.onResourceRequested = cb;
                break;
            case 'receive':
                page.onResourceReceived = cb;
                break;
            case 'error':
                page.onResourceError = cb;
                break;
            case 'timeout':
                page.onResourceTimeout = cb;
                break;
        }
    }
}
47

我寧願定期檢查document.readyState狀態(https://developer.mozilla.org/en-US/docs/Web/API/document.readyState)。雖然這種方法有點笨拙,但您可以確定在onPageReady函數中使用了完全加載的文檔。

// Open a page, poll document.readyState until it is "complete", then print
// the document's HTML and exit.
var page = require("webpage").create();
var url = "http://example.com/index.html";

// Dump the full document markup to the console and terminate PhantomJS.
function onPageReady() {
    console.log(page.evaluate(function () {
        return document.documentElement.outerHTML;
    }));

    phantom.exit();
}

page.open(url, function (status) {
    // Re-schedules itself with setTimeout (no explicit delay) until the
    // document reports "complete".
    var pollReadyState = function () {
        setTimeout(function () {
            var state = page.evaluate(function () {
                return document.readyState;
            });

            if (state !== "complete") {
                pollReadyState();
                return;
            }
            onPageReady();
        });
    };

    pollReadyState();
});

附加說明:

使用嵌套的 setTimeout 代替 setInterval,可以防止 checkReadyState 在執行因某些隨機原因被延長時發生「重疊」和競態條件。setTimeout 的默認延遲爲 4ms(https://stackoverflow.com/a/3580085/1011156),因此主動輪詢不會對程序性能造成嚴重影響。

document.readyState === "complete"表示文檔已完全加載所有資源(https://html.spec.whatwg.org/multipage/dom.html#current-document-readiness)。

+4

setTimeout的註釋vs setInterval很棒。 –

+0

'readyState'將只在DOM觸發已經滿載,但是任何''