2016-01-23 66 views
0

我試圖抓取一個網站,以獲取它的數據。到目前爲止,我得到它至少連接到網站,但現在當我嘗試設置與數據的文本框的文本,我只是得到了一堆:HtmlAgilityPack XPath返回HtmlAgilityPack.HtmlNodeCollection

HtmlAgilityPack.HtmlNodeCollection 

HtmlAgilityPack.HtmlNodeCollection出現的次數與數據的條數相同。這裏是我的代碼(我知道它有點馬虎):

using System.Collections.Generic; 
using System.Linq; 
using System.Net; 
using System.Text.RegularExpressions; 
using System.Windows.Forms; 
using System; 
using HtmlAgilityPack; 

namespace WindowsFormsApplication1 
{ 
/// <summary>
/// Windows Forms form whose button handler downloads a web page, selects
/// paragraph nodes from it by XPath, and appends text to <c>textBox1</c>.
/// </summary>
public partial class Form1 : Form 
{ 
    // String form of the item currently selected in listBox1
    // (assigned in listBox1_SelectedIndexChanged).
    string choice; 

    /// <summary>
    /// Creates the form and calls <c>InitializeComponent()</c>.
    /// </summary>
    public Form1() 
    { 
     InitializeComponent(); 
    } 

    /// <summary>
    /// Handler for comboBox1's SelectedIndexChanged event; intentionally empty.
    /// </summary>
    public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e) 
    { 

    } 

    /// <summary>
    /// Downloads the school-closings page with a GET request, parses it with
    /// HtmlAgilityPack, selects the paragraph nodes matched by an absolute
    /// XPath, and appends one piece of text to <c>textBox1</c> per matched node.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    public void button1_Click(object sender, System.EventArgs e) 
    { 
     HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument(); 
     htmlDoc.OptionFixNestedTags = true; 

     string urlToLoad = "http://www.nbcwashington.com/weather/school-closings/"; 
     HttpWebRequest request = HttpWebRequest.Create(urlToLoad) as HttpWebRequest; 
     request.Method = "GET"; 

     // Diagnostic output: the absolute URI that is about to be requested.
     Console.WriteLine(request.RequestUri.AbsoluteUri); 
     // NOTE(review): the response and its stream are never closed or disposed here.
     WebResponse response = request.GetResponse(); 

     htmlDoc.Load(response.GetResponseStream(), true); 
     if (htmlDoc.DocumentNode != null) 
     { 
      var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p"); 



      if (articleNodes != null && articleNodes.Any()) 
      { 
       foreach (var articleNode in articleNodes) 
       { 

        // NOTE(review): the loop variable articleNode is never used. Each
        // iteration re-runs the same XPath query and appends ToString() of
        // the whole resulting collection, so the same text is appended once
        // per matched node instead of that node's own content.
        textBox1.AppendText(htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p").ToString()); 

       } 
      } 
     } 

     // Reads a line from the console and discards it.
     // NOTE(review): presumably left over from a console prototype; confirm it is needed in a UI handler.
     Console.ReadLine(); 
    } 

    /// <summary>
    /// Stores the string form of listBox1's selected item in <c>choice</c>.
    /// </summary>
    private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e) 
    { 
     // NOTE(review): SelectedItem is dereferenced without a null check; verify it cannot be null here.
     choice = listBox1.SelectedItem.ToString(); 
    } 



} 
} 

所以我在這裏錯過/做錯了什麼?數據應該返回類似於:

Warren County Public Schools Closed 
Washington Adventist University Closing at Noon 

感謝您關注此問題。

+0

將響應轉換爲字符串並查看HTML,我們就能告訴你哪裏出了問題。此外,這些null檢查大多是不需要的。 – mybirthname

回答

0

沒關係,找到了問題。我想我試圖抓住文檔節點而不是內部文本...這裏是代碼,以防萬一任何人想要它。

using System.Collections.Generic; 
using System.Linq; 
using System.Net; 
using System.Text.RegularExpressions; 
using System.Windows.Forms; 
using System; 
using HtmlAgilityPack; 

namespace WindowsFormsApplication1
{
    /// <summary>
    /// Windows Forms form that downloads the school-closings page and lists the
    /// text of every matching paragraph in <c>textBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // String form of the item currently selected in listBox1;
        // null when nothing is selected.
        string choice;

        /// <summary>
        /// Creates the form and initializes its designer-generated controls.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Handler for comboBox1's SelectedIndexChanged event; intentionally empty.
        /// </summary>
        public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
        }

        /// <summary>
        /// Downloads the school-closings page, parses it with HtmlAgilityPack and
        /// appends the inner text of each matched paragraph to <c>textBox1</c>,
        /// one entry per line.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        public void button1_Click(object sender, System.EventArgs e)
        {
            const string urlToLoad = "http://www.nbcwashington.com/weather/school-closings/";
            const string paragraphXPath = "/html/body/div/div/div/div/div/div/p";

            // Fully qualified because System.Windows.Forms also declares an HtmlDocument type.
            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            try
            {
                // Direct cast instead of "as": a wrong request type fails here with a
                // clear InvalidCastException rather than a later NullReferenceException.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlToLoad);
                request.Method = "GET";

                // Diagnostic output: the absolute URI that is about to be requested.
                Console.WriteLine(request.RequestUri.AbsoluteUri);

                // Dispose the response and its stream so the connection is released
                // even if parsing throws.
                using (WebResponse response = request.GetResponse())
                using (var responseStream = response.GetResponseStream())
                {
                    htmlDoc.Load(responseStream, true);
                }
            }
            catch (WebException ex)
            {
                // A failed download is an expected condition for a scraper; report it
                // instead of letting the exception escape the UI event handler.
                MessageBox.Show(this, "Could not download the page: " + ex.Message);
                return;
            }

            if (htmlDoc.DocumentNode == null)
            {
                return;
            }

            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so guard before iterating.
            HtmlNodeCollection articleNodes = htmlDoc.DocumentNode.SelectNodes(paragraphXPath);
            if (articleNodes == null)
            {
                return;
            }

            foreach (HtmlNode articleNode in articleNodes)
            {
                // A WinForms TextBox only breaks lines on "\r\n", so use the
                // platform newline rather than a bare "\n".
                textBox1.AppendText(articleNode.InnerText + Environment.NewLine);
            }
        }

        /// <summary>
        /// Remembers the string form of listBox1's selected item in <c>choice</c>,
        /// or null when the selection has been cleared.
        /// </summary>
        private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
            // SelectedItem is null when no item is selected; avoid dereferencing it.
            object selected = listBox1.SelectedItem;
            choice = selected != null ? selected.ToString() : null;
        }
    }
}

0

由於articleNodes已經包含了你感興趣的節點,就沒有必要在循環中再次調用SelectNodes()。

此外,您不需要檢查null,因爲articleNodes是一個集合。它可能是空,但不應該是null

試試這個,訪問InnerHtml(或InnerText)屬性,而不是:

// Select every <p> element reached by this absolute XPath.
// NOTE(review): HtmlAgilityPack is commonly reported to return null, not an
// empty collection, from SelectNodes when nothing matches — confirm before
// relying on articleNodes being non-null.
var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p");

// For each paragraph, take its inner markup, turn every "<br><span>" into a
// single space and drop every " </span>", then materialize the strings as a list.
var result = (from paragraph in articleNodes
              let markup = paragraph.InnerHtml
              select markup.Replace("<br><span>", " ").Replace(" </span>", "")).ToList();