2015-08-14 75 views
1

我有一個要求來聚合來自幾個不同的網站(主要是HTML頁面和PDF文檔)的內容。我目前正在試驗Heritrix(3.2.0)以查看它是否能滿足我的需求。Heritrix內容過濾

儘管文檔非常詳細,但引擎似乎並沒有像我期望的那樣工作。我已經設置了一些簡單的工作,並配置了很多不同的方式,但不管我做了什麼,我發現Heritrix要麼拉下太多的內容,要麼什麼都不拉。

下面是我想要做的一個例子。我正在將Heritrix指向URL,如... example.com/news/speeches。這是一個帶有HTML表格的網頁,其中包含指向個人演講(ex..example.com/news/speech/speech1.html,xample.com/news/speech/speech2.html等)的鏈接。我真的只需要在父頁面下一級的HTML和PDF文檔。我希望阻止Heritrix進行比1級更深的導航,如果不低於example.com域上的此特定路徑,阻止Heritrix拉動內容,阻止它導航到另一個域,並將其限制爲html和pdf內容。

以下配置是什麼,我覺得應該工作,但不

<bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> 
     <property name="properties"> 
     <props> 
     <prop key="seeds.textSource.value"> 

    # URLS HERE 
    example.com/news/speeches 

     </prop> 
     </props> 
     </property> 
    </bean> 

<bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence"> 
    <!-- <property name="logToFile" value="false" /> --> 
    <property name="rules"> 
    <list> 
    <!-- Begin by REJECTing all... --> 
    <bean class="org.archive.modules.deciderules.RejectDecideRule"> 
    </bean> 
    <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... --> 
    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 
    <!-- <property name="seedsAsSurtPrefixes" value="true" /> --> 
    <!-- <property name="alsoCheckVia" value="false" /> --> 
    <!-- <property name="surtsSourceFile" value="" /> --> 
    <!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> --> 
     <property name="surtsSource"> 
      <bean class="org.archive.spring.ConfigString"> 
      <property name="value"> 
      <value> 
      example.com/news/speeches 
      </value> 
      </property> 
      </bean> 
      </property> 
    </bean> 
    <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> 
<bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 
     <property name="decision" value="REJECT"/> 
     <property name="listLogicalOr" value="true" /> 
     <property name="regexList"> 
     <list> 
     <value>.*(?i)(\.(avi|wmv|mpe?g|mp3))$</value> 
     <value>.*(?i)(\.(rar|zip|tar|gz))$</value> 
     <value>.*(?i)(\.(xls|odt))$</value> 
     <value>.*(?i)(\.(xml))$</value> 
     <value>.*(?i)(\.(txt|conf|pdf))$</value> 
     <value>.*(?i)(\.(swf))$</value> 
     <value>.*(?i)(\.(js|css))$</value> 
     <value>.*(?i)(\.(bmp|gif|jpe?g|png|svg|tiff?))$</value> 
     </list> 
     </property> 
</bean> 
    <!-- ...but REJECT those more than a configured link-hop-count from start... --> 
    <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule"> 
    <!-- <property name="maxHops" value="20" /> --> 
    </bean> 
    <!-- ...but ACCEPT those more than a configured link-hop-count from start... --> 
    <!--bean class="org.archive.modules.deciderules.TransclusionDecideRule"--> 
    <!-- <property name="maxTransHops" value="2" /> --> 
    <!-- <property name="maxSpeculativeHops" value="1" /> --> 
    <!--/bean--> 
    <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... --> 
    <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 
      <property name="decision" value="REJECT"/> 
      <property name="seedsAsSurtPrefixes" value="false"/> 
      <property name="surtsDumpFile" value="${launchId}/negative-surts.dump" /> 
    <!-- <property name="surtsSource"> 
      <bean class="org.archive.spring.ConfigFile"> 
      <property name="path" value="negative-surts.txt" /> 
      </bean> 
      </property> --> 
    </bean> 
    <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> 
    <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 
      <property name="decision" value="REJECT"/> 
    <!-- <property name="listLogicalOr" value="true" /> --> 
    <!-- <property name="regexList"> 
      <list> 
      </list> 
      </property> --> 
    </bean> 
    <!-- ...and REJECT those with suspicious repeating path-segments... --> 
    <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule"> 
    <!-- <property name="maxRepetitions" value="2" /> --> 
    </bean> 
    <!-- ...and REJECT those with more than threshold number of path-segments... --> 
    <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule"> 
    <!-- <property name="maxPathDepth" value="20" /> --> 
    </bean> 
    <!-- ...but always ACCEPT those marked as prerequisitee for another URI... --> 
    <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule"> 
    </bean> 
    <!-- ...but always REJECT those with unsupported URI schemes --> 
    <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule"> 
    </bean> 
    </list> 
    </property> 
</bean> 

我希望我的檢索只拉下十幾HTML文檔作爲是所有的包含/語音通道內。大約半小時後,我停止了抓取,因爲它正在下載800多個文檔,因爲我發現它正在向後移動到父級路徑。我也嘗試了RegEx規則,但沒有運氣。任何幫助,將不勝感激。

回答

0

調試此類問題的一件好事是啓用作用域決策的日誌記錄。 (取消註釋logToFile並將其設置爲true。這將爲您的每個URI提供決策包含或拒絕的規則,因此您將能夠看到您的規則沒有正確配置,並接受應該已被拒絕。