我在Apache Spark上運行UIMA應用程序。有數百萬個頁面進入批處理,由UIMA RUTA進行處理。但有一段時間我面臨內存異常。它有時會拋出異常,因爲它成功處理了頁面,但在頁面上有一些時間失敗。Uima Ruta在火花上下文中的內存不足問題
應用程序日誌
Caused by: java.lang.OutOfMemoryError: Java heap space
at org.apache.uima.internal.util.IntArrayUtils.expand_size(IntArrayUtils.java:57)
at org.apache.uima.internal.util.IntArrayUtils.ensure_size(IntArrayUtils.java:39)
at org.apache.uima.cas.impl.Heap.grow(Heap.java:187)
at org.apache.uima.cas.impl.Heap.add(Heap.java:241)
at org.apache.uima.cas.impl.CASImpl.ll_createFS(CASImpl.java:2844)
at org.apache.uima.cas.impl.CASImpl.createFS(CASImpl.java:489)
at org.apache.uima.cas.impl.CASImpl.createAnnotation(CASImpl.java:3837)
at org.apache.uima.ruta.rule.RuleMatch.getMatchedAnnotations(RuleMatch.java:172)
at org.apache.uima.ruta.rule.RuleMatch.getMatchedAnnotationsOf(RuleMatch.java:68)
at org.apache.uima.ruta.rule.RuleMatch.getLastMatchedAnnotation(RuleMatch.java:73)
at org.apache.uima.ruta.rule.ComposedRuleElement.mergeDisjunctiveRuleMatches(ComposedRuleElement.java:330)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:213)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueMatch(ComposedRuleElement.java:225)
at org.apache.uima.ruta.rule.ComposedRuleElement.continueOwnMatch(ComposedRuleElement.java:362)
at org.apache.uima.ruta.rule.ComposedRuleElement.fallbackContinue(ComposedRuleElement.java:459)
UIMA RUTA SCRIPT
WORDLIST EnglishStopWordList = 'stopWords.txt';
WORDLIST FiltersList = 'AnchorFilters.txt';
DECLARE Filters, EnglishStopWords;
DECLARE Anchors, SpanStart,SpanClose;
DocumentAnnotation{-> ADDRETAINTYPE(MARKUP)};
DocumentAnnotation{-> MARKFAST(Filters, FiltersList)};
STRING MixCharacterRegex = "[0-9]+[a-zA-Z]+";
DocumentAnnotation{-> MARKFAST(EnglishStopWords, EnglishStopWordList,true)};
(SW | CW | CAP) { -> MARK(Anchors, 1, 2)};
Anchors{CONTAINS(EnglishStopWords) -> UNMARK(Anchors)};
(SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM) (SW | CW | CAP) (SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM) EnglishStopWords? { -> MARK(Anchors, 1, 4)};
(SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM)? (SW | CW | CAP) (SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM) EnglishStopWords? { -> MARK(Anchors, 1, 4)};
(SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM) (SW | CW | CAP) (SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM)? EnglishStopWords? { -> MARK(Anchors, 1, 4)};
(SW | CW | CAP) (SPECIAL{REGEXP("['\"-=()\\[\\]]")}| PM) EnglishStopWords? { -> MARK(Anchors, 1, 3)};
Anchors{CONTAINS(MARKUP) -> UNMARK(Anchors)};
MixCharacterRegex -> Anchors;
"<Value>" -> SpanStart;
"</Value>" -> SpanClose;
Anchors{-> CREATE(ExtractedData, "type" = "ANCHOR", "value" = Anchors)};
SpanStart Filters? SPACE? ExtractedData SPACE? Filters? SpanClose{-> GATHER(Data, 2, 6, "ExtractedData" = 4)};
你可以添加一個鏈接到(模擬)文件,該文件會導致這樣的問題? –
你使用哪種ruta版本? –