1
我嘗試用 Abot 抓取網站地圖(sitemap)。我的代碼參考了這篇文章:「用 Abot 抓取網站地圖」。
頁面抓取完成後,在 Crawler_PageCrawlCompleted 事件處理程序中,e.CrawledPage 的內容文本爲空。另外,SiteMapFinder.GetLinks 從未被調用。
請指教我,我的問題在哪裏。
using Abot.Core;
using Abot.Crawler;
using Abot.Poco;
using CsQuery.ExtensionMethods;
using System;
using System.Collections.Generic;
namespace WebCrawler
{
/// <summary>
/// Hyperlink parser that writes the URI of every XML page it is handed to the console
/// and then delegates the actual link extraction to an <see cref="AngleSharpHyperlinkParser"/>.
/// </summary>
public class SiteMapFinder : IHyperLinkParser
{
    private readonly HyperLinkParser _linkParser;

    /// <summary>
    /// Creates the finder with an <see cref="AngleSharpHyperlinkParser"/> as the inner parser.
    /// </summary>
    public SiteMapFinder()
    {
        _linkParser = new AngleSharpHyperlinkParser();
    }

    /// <summary>
    /// Logs the page URI when the response is XML, then returns the links found by the inner parser.
    /// </summary>
    /// <param name="crawledPage">The page handed over by the crawler.</param>
    /// <returns>The links produced by the inner parser for <paramref name="crawledPage"/>.</returns>
    IEnumerable<Uri> IHyperLinkParser.GetLinks(CrawledPage crawledPage)
    {
        // NOTE(review): the response is presumably absent when the request itself failed;
        // guard so logging never throws before the inner parser gets a chance to run.
        var response = crawledPage?.HttpWebResponse;
        if (response != null && IsXmlContentType(response.ContentType))
        {
            Console.WriteLine(crawledPage.Uri.AbsoluteUri);
        }

        return _linkParser.GetLinks(crawledPage);
    }

    /// <summary>
    /// Returns true when the Content-Type header value denotes an XML document.
    /// Parameters such as "; charset=utf-8" and letter casing are ignored.
    /// </summary>
    private static bool IsXmlContentType(string contentType)
    {
        if (string.IsNullOrWhiteSpace(contentType))
        {
            return false;
        }

        // Keep only the media type, dropping any ";"-separated parameters.
        int separator = contentType.IndexOf(';');
        string mediaType = (separator >= 0 ? contentType.Substring(0, separator) : contentType).Trim();

        return mediaType.Equals("text/xml", StringComparison.OrdinalIgnoreCase)
            || mediaType.Equals("application/xml", StringComparison.OrdinalIgnoreCase);
    }
}
/// <summary>
/// Console entry point: crawls a sitemap URL with Abot and prints the response headers
/// of every crawled page.
/// </summary>
class Program
{
    // Content types a sitemap is commonly served with.
    private const string XmlContentTypes = "text/xml, application/xml";

    /// <summary>
    /// Configures the crawler, starts the crawl and reports a crawl-level failure, if any.
    /// </summary>
    static void Main(string[] args)
    {
        SiteMapFinder finder = new SiteMapFinder();
        CrawlConfiguration configuration = BuildConfiguration();
        PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, finder, null, null, null);
        crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
        CrawlResult result = crawler.Crawl(new Uri("http://www.example.com/sitemap/"));

        if (result.ErrorOccurred)
        {
            Console.WriteLine($"Crawl of {result.RootUri.AbsoluteUri} failed: {result.ErrorException.Message}");
        }
    }

    /// <summary>
    /// Builds the crawl configuration from the "abot" config section when one exists
    /// (otherwise from defaults) and makes sure XML content is downloadable.
    /// </summary>
    /// <remarks>
    /// NOTE(review): Abot only downloads bodies whose content type is listed in
    /// DownloadableContentTypes (typically just "text/html"). An XML sitemap is otherwise
    /// left without content and its links are never parsed - confirm against the Abot version in use.
    /// </remarks>
    private static CrawlConfiguration BuildConfiguration()
    {
        AbotConfigurationSectionHandler section = AbotConfigurationSectionHandler.LoadFromXml();
        CrawlConfiguration configuration = section != null ? section.Convert() : new CrawlConfiguration();

        string downloadable = configuration.DownloadableContentTypes ?? string.Empty;
        if (downloadable.IndexOf("xml", StringComparison.OrdinalIgnoreCase) < 0)
        {
            // Append rather than replace so configured types stay downloadable.
            configuration.DownloadableContentTypes = downloadable + ", " + XmlContentTypes;
        }

        return configuration;
    }

    /// <summary>
    /// Prints the URI of the crawled page followed by each response header as "name: value".
    /// </summary>
    private static void Crawler_PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
    {
        CrawledPage page = e.CrawledPage;
        Console.WriteLine(page.Uri.AbsoluteUri);

        var response = page.HttpWebResponse;
        if (response == null)
        {
            // No response to inspect; report the failure instead of throwing inside the handler.
            Console.WriteLine($"No response received: {page.WebException?.Message}");
            return;
        }

        foreach (string k in response.Headers.AllKeys)
        {
            Console.WriteLine($"{k}: {response.Headers[k]}");
        }
    }
}
}