2012-06-19 58 views
0

我有以下節點(可能有很多個),是通過流讀取器(StreamReader)讀取的。我只想提取此節點中的幾個字段,例如 REPLICATE_ID、ASSAY_NUMBER 以及幾個日期字段。其餘的組該如何忽略?

節點中字段的排序可能不同,有時候也可能會出現新字段,但我想提取的字段不會更改。

到目前爲止,我的正則表達式是匹配整個節點,所以只要節點出現新字段或字段順序不同,匹配就會失敗。是否可以只匹配我感興趣的組?

TEST_REPLICATE 
    { 
     REPLICATE_ID   453w 
     ASSAY_NUMBER   334 
     ASSAY_VERSION   4 
     ASSAY_STATUS   test 
     DILUTION_ID   1 
     SAMPLE_ID   "NC_dede" 
     SAMPLE_TYPE   Specimen 
     TEST_ORDER_DATE   05.23.2012 
     TEST_ORDER_TIME   04:25:07 
     TEST_INITIATION_DATE  05.23.2012 
     TEST_INITIATION_TIME  05:19:43 
     TEST_COMPLETION_DATE  05.23.2012 
     TEST_COMPLETION_TIME  05:48:01 
     ASSAY_CALIBRATION_DATE  NA 
     ASSAY_CALIBRATION_TIME  NA 
     TRACK   1 
     PROCESSING_LANE  1 
     MODULE_SN  "EP004" 
     LOAD_LIST_NAME   C:\BwedwQwedw_SCC\edwLoadlist2RACKSB.json 
     OPERATOR_ID   "Q_dwe" 
     DARK_SUBREADS   16 23 19 20 16 18 21 16 17 18 19 19 20 22 19 20 19 20 18 20 17 20 21 16 19 23 20 22 19 20 
     SIGNAL_SUBREADS   18 17 20 21 42 61 41 31 30 30 26 26 25 22 24 
     DARK_COUNT   577 
     SIGNAL_COUNT   781 
     CORRECTED_COUNT   204 
     STD_BAK    1.95965044971226 
     AVG_BAK    19.2333333333333 
     STD_FOR    8.67212471810898 
     AVG_FOR    26.0333333333333 
     SHAPE    NA 
     EXCEPTION_STRING  TestException - Parameters:Unable to process test, background read failure. 
     RESULT    NA 
     REPORTED_RESULT   NA 
     REPORTED_RESULT_UNITS  NA 
     REAGENT_MASTER_LOT  13600LI02 
     REAGENT_SERIAL_NUMBER  25022 
     RESULT_FLAGS   RUO 
     RESULT_INTERPRETATION  NA 
     DILUTION_PROTOCOL  UNDILUTED 
     RESULT_COMMENT   frer 1 LANE A 
     DATA_MANAGEMENT_FIELD_1  NA 
     DATA_MANAGEMENT_FIELD_2  NA 
     DATA_MANAGEMENT_FIELD_3  NA 
     DATA_MANAGEMENT_FIELD_4  NA 
    } 

    // Original attempt from the question: a single pattern that names the fields of a
    // TEST_REPLICATE record literally, in one fixed order, with a lazy capture group
    // ([^}]*?) after each field name. Because the field names must appear in exactly
    // this sequence, a reordered field makes the whole match fail.
    string pat = @"TEST_REPLICATE\s*{\s*REPLICATE_ID\s*([^}]*?)\s+ASSAY_NUMBER\s*([^}]*?)\s+ASSAY_VERSION\s*([^}]*?)\s+DILUTION_ID\s*([^}]*?)\s+SAMPLE_ID\s*([^}]*?)\s+SAMPLE_TYPE\s*([^}]*?)\s+TEST_ORDER_DATE\s*([^}]*?)\s+TEST_ORDER_TIME\s*([^}]*?)\s+TEST_INITIATION_DATE\s*([^}]*?)\s+TEST_INITIATION_TIME\s*([^}]*?)\s+TEST_COMPLETION_DATE\s*([^}]*?)\s+TEST_COMPLETION_TIME\s*([^}]*?)\s+ASSAY_CALIBRATION_DATE\s*([^}]*?)\s+ASSAY_CALIBRATION_TIME\s*([^}]*?)\s+TRACK\s*([^}]*?)\s+PROCESSING_LANE\s*([^}]*?)\s+MODULE_SN\s*([^}]*?)\s+LOAD_LIST_NAME\s*([^}]*?)\s+OPERATOR_ID\s*([^}]*?)\s+DARK_SUBREADS\s*([^}]*?)\s+SIGNAL_SUBREADS\s*([^}]*?)\s+DARK_COUNT\s*([^}]*?)\s+SIGNAL_COUNT\s*([^}]*?)\s+CORRECTED_COUNT\s*([^}]*?)\s+STD_BAK\s*([^}]*?)\s+AVG_BAK\s*([^}]*?)\s+STD_FOR\s*([^}]*?)\s+AVG_FOR\s*([^}]*?)\s+SHAPE\s*([^}]*?)\s+EXCEPTION_STRING\s*([^}]*?)\s+RESULT\s*([^}]*?)\s+REPORTED_RESULT\s*([^}]*?)\s+REPORTED_RESULT_UNITS\s*([^}]*?)\s+REAGENT_MASTER_LOT\s*([^}]*?)\s+REAGENT_SERIAL_NUMBER\s*([^}]*?)\s+RESULT_FLAGS\s*([^}]*?)\s+RESULT_INTERPRETATION\s*([^}]*?)\s+DILUTION_PROTOCOL\s*([^}]*?)\s+RESULT_COMMENT\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_1\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_2\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_3\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_4\s*([^}]*?)\s*}"; 
+3

別再這樣折騰了,直接按結構正確地解析即可。每一行有一個鍵/名稱,後面跟着空格和一個值(該行的其餘部分)。逐行迭代,把它們全部解析到散列/字典中,或者只保存您需要的內容。 – Qtax

回答

0

是的,你可能應該只是解析鍵值對的記錄。

這是一個代碼示例,如果您想從記錄中提取鍵值對。
如果找到匹配項,就可以遍歷捕獲集合中的鍵,檢查其中是否有您需要的鍵。

你也可以調整正則表達式中對記錄開始/結束位置的要求。
但是不要改動核心部分(原子組),它可以防止災難性的回溯。

正則表達式的替代品:

# Record starts on a new line, closing brace can be anywhere 

^ [^\S\n]*TEST_REPLICATE\s*\{ 
(?> 
     \s* (?<key> [^\s{}]+) [^\S\n]* (?<val> [^\n{}]*?) [^\S\n]* (?:$|(?=\})) 
)* 
\s*\} 


# Record starts anywhere, closing brace is on a new line 

TEST_REPLICATE\s*\{ 
(?> 
     \s* (?<key> [^\s{}]+) [^\S\n]* (?<val> [^\n{}]*?) [^\S\n]* $ 
)* 
\s*\} 

C#測試代碼:

// Matches one whole TEST_REPLICATE { ... } record per match. Inside the braces,
// each "key value" line adds one capture to the "key" group and one capture to
// the "val" group, so the pattern does not depend on which fields are present
// or on their order.
// IgnorePatternWhitespace permits the spaced-out layout and the '#' comments
// inside the pattern; Multiline makes ^ and $ match at line boundaries.
Regex testRx = new Regex(
@" 
^[^\S\n]* TEST_REPLICATE  # Record, starts on a newline 
    \s*       # Optional whitespaces (trims blank lines) 
    \{       # Record opening brace 
     (?>       # Atomic group 
     \s*       # Optional many whitespace (trims blank lines) 
     # Line in record to be recorded 
     (?<key> [^\s{}]+)    # required <key>, not whitespacs nor braces 
     [^\S\n]*       # trim whitespaces (don't include newline) 
     (?<val> [^\n{}]*?)    # optional <value>, not newlines nor braces 
     [^\S\n]*       # trim whitespaces (don't include newline) 
     (?:$|(?=\}))     # End of line, or next char is a closing brace 
    )*       # End atomic group, do many times (optional) 
    \s*       # Optional whitespaces (trims blank lines) 
    \}       # Record closing brace 
", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline); 

// Sample input holding three records: an empty record with the braces on the
// same line, a short record with four fields, and a long record whose opening
// brace sits on its own line. The long record also contains a key with no value
// (REPLICATE_ID), a blank line, and two unindented field lines.
string testdata = @" 
TEST_REPLICATE{} 
TEST_REPLICATE{ 
    REPLICATE_ID   1asdf985 
    ASSAY_NUMBER   123sdg 
    ASSAY_VERSION   4sdgn 
    ASSAY_TYPE   unknown 
} 

TEST_REPLICATE 
{ 
    REPLICATE_ID    
    ASSAY_NUMBER   123  
    ASSAY_VERSION   4 
    ASSAY_TYPE   unknown 
    DILUTION_ID   1 
    SAMPLE_ID   ""NC_HIV1"" 
    SAMPLE_TYPE   Specimen 
    TEST_ORDER_DATE   05.21.2012 
    TEST_ORDER_TIME   03:44:01 
    TEST_INITIATION_DATE  05.21.2012 
    TEST_INITIATION_TIME  04:03:36 

TEST_COMPLETION_DATE  05.21.2012 
TEST_COMPLETION_TIME  04:29:32 
    ASSAY_CALIBRATION_DATE    NA 
    ASSAY_CALIBRATION_TIME  NA 
    TRACK   1 
    PROCESSING_LANE  1 
    MODULE_SN  ""EP004"" 
    LOAD_LIST_NAME   C:\sdddd 
    OPERATOR_ID   ""Q_SI"" 
    DARK_SUBREADS   NA 
    SIGNAL_SUBREADS   NA 
    DARK_COUNT   NA 
    SIGNAL_COUNT   NA 
    CORRECTED_COUNT   NA 
    STD_BAK    NA 
    AVG_BAK    NA 
    STD_FOR    NA 
    AVG_FOR    NA 
    SHAPE    NA 
    EXCEPTION_STRING  Test execution was stopped. 
    RESULT    NA 
    REPORTED_RESULT   NA 
    REPORTED_RESULT_UNITS  NA 
    REAGENT_MASTER_LOT  2345 
    REAGENT_SERIAL_NUMBER  25022 
    RESULT_FLAGS   NA 
    RESULT_INTERPRETATION  NA 
    DILUTION_PROTOCOL  UNDILUTED 
    RESULT_COMMENT   HIV NC 1 
    DATA_MANAGEMENT_FIELD_1  NA 
    DATA_MANAGEMENT_FIELD_2  NA 
    DATA_MANAGEMENT_FIELD_3  NA 
    DATA_MANAGEMENT_FIELD_4  NA 
} 
    "; 

// Walk through every record found in the input: each successful match of
// testRx is one complete record, and NextMatch() advances to the following one.
Match m_testrec;
for (m_testrec = testRx.Match(testdata); m_testrec.Success; m_testrec = m_testrec.NextMatch())
{
    Console.WriteLine("New Record\n------------------------");

    // The "key" and "val" groups are captured together on each field line,
    // so the n-th key capture is paired with the n-th value capture.
    Group keyGroup = m_testrec.Groups["key"];
    Group valueGroup = m_testrec.Groups["val"];

    int position = 0;
    foreach (Capture keyCapture in keyGroup.Captures)
    {
        Capture valueCapture = valueGroup.Captures[position];
        Console.WriteLine("'{0}' = '{1}'", keyCapture.Value, valueCapture.Value);

        // Pick out the fields of interest here, e.g.
        // if (keyCapture.Value == "REAGENT_SERIAL_NUMBER") ...

        position++;
    }

    Console.WriteLine("------------------------");
}