2016-12-02 96 views
0

好了，我正在使用 PhantomJS，想讓 PhantomJS 依次加載一個 URL 數組中的每個網址，並把每個頁面分別保存爲 HTML 文件。PhantomJS 保存不工作

到目前爲止，我的代碼基本能運行。它會開始逐個加載頁面並保存，但處理了若干頁面之後，它突然不再加載新頁面，而是一遍又一遍地保存同一個頁面。我認爲部分原因是我沒有調用 page.close()，但是一旦加上它，代碼就完全無法工作。

我希望有人能幫忙，尤其是解釋一下問題的原因。如果有人能提供解決方案，我將不勝感激。我知道代碼現在很亂，我打算等找到問題之後再整理。

var fs = require('fs'); 

/* This is used to get the array of urls I'm trying to find.
function linkfinder(){ 
var array = fs.read('C:\\Users\\jacob\\Documents\\SDD\\links.txt').toString().split('\n'); 
console.log(array[1]); 
console.log('ffff'); 
return array; 
}*/ 

// Queue of profile pages to download. This must be an array of quoted strings:
// bare URLs inside an object literal are a syntax error, and the queue is
// consumed with Array.prototype.shift().
var urls = [
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1476',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1548',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1781',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1506',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1321',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1390',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1430',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1707',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1477',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1431',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1678',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1409',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1239',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1765',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=2203',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1889',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=2240',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1650',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1490',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1514',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1322',
    'http://www.njcaa.org/member_colleges/college-profile?collegeId=1744'
];

    var page = new WebPage(); 
    function handle_page(url){ 
    page.open(url, function(){ 
    //... 
    page.injectJs('jquery.min.js'); 
    // var html = page.evaluate(function(){ 
     // return document.getElementsByTagName('html')[0].innerHTML 
    // }); 
    //save to file 
    page.onLoadFinished = function() { 
    console.log("page load finished"); 
    var path ='C:\\Users\\jacob\\Documents\\SDD\\schools\\.html'; 
    var linked = url.substr(63, 4); 
    var output = [path.slice(0, 37), linked, path.slice(37)].join(''); 
    console.log(output); 
    //page.render('C:\Users\jacob\Documents\export.png'); 

    fs.write(output, page.content, 'w'); 

}; 

    // page.close(); 

    next_page(); 
}); 

} 

/**
 * Take the next URL off the front of `urls` and hand it to handle_page().
 * When no URL is left (or the entry is empty), PhantomJS is asked to exit
 * with code 0.
 */
function next_page(){
    // The queue is declared as `urls`; no `links` variable exists in this script.
    var url = urls.shift();

    if (!url) {
        phantom.exit(0);
        // Return explicitly so that handle_page() is never called with an
        // empty URL while the exit is being processed.
        return;
    }

    handle_page(url);
}

// Kick off processing: next_page() takes the first queued entry and loads it.
next_page(); 

回答

0

下面的代碼可以工作，但你需要指定正確的路徑（我在 Linux 上運行，我這裏的路徑是 /root/pjs）。

var page = require('webpage').create(), fs = require('fs'); 

// NOTE: saving from page.onLoadFinished won't work at all here (the same content
// ends up saved for every page), so the handler is left as a no-op and on_a_page
// is passed to page.open() as the callback instead.
page.onLoadFinished = function() {}
// Array of profile URLs, built by appending each college id to the shared prefix.
var urls = [
    1476, 1548, 1781, 1506, 1321, 1390, 1430, 1707,
    1477, 1431, 1678, 1409, 1239, 1765, 2203, 1889,
    2240, 1650, 1490, 1514, 1322, 1744
].map(function (collegeId) {
    return "http://www.njcaa.org/member_colleges/college-profile?collegeId=" + collegeId;
});
// Index into `urls` of the page that is currently being loaded.
var i = 0;

/**
 * Callback for page.open(): saves the page that has just loaded and then either
 * opens the next URL or exits PhantomJS once every URL has been handled.
 *
 * @param {string} status - Load result passed by page.open(); nothing is written
 *     unless it is 'success', so a failed load cannot produce a bogus file.
 */
function on_a_page(status) {
    // Remember which URL this callback belongs to before advancing the index.
    var requested = urls[i];
    i++;

    if (status === 'success') {
        console.log("page load finished");
        // Name the file after the collegeId of the URL that was requested; a fixed
        // offset into page.url breaks if the final URL differs from the requested one.
        var match = /[?&]collegeId=(\d+)/.exec(requested);
        var name = match ? match[1] : 'page' + i;
        // You need to specify the right path ('/root/pjs' is the answerer's Linux path).
        var output = '/root/pjs/' + name + '.html';
        console.log(output);
        fs.write(output, page.content, 'w');
    } else {
        console.log('failed to load ' + requested);
    }

    if (i < urls.length) {
        to_open();
    } else {
        phantom.exit();
    }
}
/**
 * Open the URL at the current index `i`, with on_a_page as the page.open()
 * callback, then log the index.
 */
function to_open() {
    var target = urls[i];
    page.open(target, on_a_page);
    console.log(i);
}
// Start the chain: open urls[0]; on_a_page then opens each following URL in turn.
to_open()