2012-01-10 60 views
6

首先,請容我爲這個問題的篇幅道歉,但我真的想用函數式的方式來思考,而這是我處理過的較具挑戰性的問題之一。問題是:如何編寫一個函數式的文件「掃描器」?

我想獲得一些建議,說明如何以函數式的方式處理這個問題,特別是在F#中。我正在編寫一個程序來遍歷一個目錄列表,使用一組正則表達式來過濾從這些目錄中檢索到的文件列表,並使用第二組正則表達式在被檢索文件的文本中查找匹配項。我希望它能爲每一段匹配給定正則表達式模式的文本返回文件名、行索引、列索引、模式和匹配值。另外,異常需要被記錄下來,並且有3種可能的異常情況:無法打開目錄、無法打開文件、從文件中讀取內容失敗。最後一個要求是:被「掃描」以查找匹配的文件數量可能非常大,所以整個過程需要是惰性(lazy)的。我並不太在意是否是「純」函數式的解決方案,我更關心的是一個「好」的解決方案:讀起來清晰,並且運行良好。最後一個挑戰是讓它能與C#互操作,因爲我想使用WinForms工具把這個算法接到UI上。這是我的第一次嘗試,希望它能把問題說明清楚:

open System.Text.RegularExpressions 
open System.IO 

/// A reader computation: a function from a shared environment 't to a result 'a
/// (the monadic type M['a]; the result varies with the environment).
type Reader<'t, 'a> = 't -> 'a

/// Lifts a plain value into a reader that ignores its environment.
let returnM x = fun _ -> x

/// Post-processes the result of reader `m` with the plain function `f`.
let map f m = fun t -> f (m t)

/// Runs both readers against the same environment and applies the function
/// produced by `f` to the value produced by `m`.
let apply f m = fun t -> (f t) (m t)

/// Feeds the result of reader `m` to `f`, then runs the reader returned by `f`
/// against the same environment.
let bind f m = fun t -> (f (m t)) t

/// Builds a lazy, reader-style file scanner over the directories in `dirs`.
///
/// The result is consumed by applying it, in order, to:
///   1. `dirExHandler : exn -> string -> unit`, called when a directory cannot be listed;
///   2. `(filenamepatterns, lineExHandler, fileExHandler)`, where `filenamepatterns` are
///      regex patterns tested against each file path, `lineExHandler : exn -> string -> int -> unit`
///      is called when reading a line fails, and `fileExHandler : exn -> string -> unit`
///      is called when a file cannot be opened;
///   3. `contentpatterns`, the regex patterns searched for in every line.
///
/// It yields one `(filename, lineIndex, columnIndex, pattern, matchedValue)` tuple per match.
let Scanner dirs =
    // Pairs every pattern with its compiled Regex. Seq.cache keeps compilation lazy
    // but guarantees each pattern is compiled at most once instead of once per file/line.
    let compile patterns =
        patterns
        |> Seq.map (fun (pattern : string) -> pattern, new Regex(pattern))
        |> Seq.cache

    // Opens `filename` for reading; reports the failure and returns None otherwise.
    let tryOpenText fileExHandler (filename : string) =
        try
            Some((new FileInfo(filename)).OpenText())
        with e ->
            fileExHandler e filename
            None

    // Lazily produces (line, zeroBasedIndex) pairs; a read failure is reported
    // through `lineExHandler` and ends the sequence.
    let numberedLines lineExHandler (filename : string) (reader : StreamReader) =
        Seq.unfold (fun index ->
            try
                if reader.EndOfStream then None
                else Some((reader.ReadLine(), index), index + 1)
            with e ->
                lineExHandler e filename index
                None) 0

    returnM dirs
    |> apply (fun dirExHandler ->
        Seq.collect (fun directory ->
            try
                Directory.GetFiles(directory, "*", SearchOption.AllDirectories)
            with e ->
                dirExHandler e directory
                Array.empty))
    |> map (fun filenames ->
        returnM filenames
        |> apply (fun (filenamepatterns, lineExHandler, fileExHandler) ->
            let filenameRegexes = compile filenamepatterns
            Seq.filter (fun (filename : string) ->
                filenameRegexes
                |> Seq.exists (fun (_, regex : Regex) -> regex.IsMatch(filename)))
            >> Seq.map (fun (filename : string) ->
                // The reader must stay open for as long as the lines are being
                // enumerated, so it is owned by the sequence expression itself:
                // `use` inside `seq { }` disposes it when enumeration finishes,
                // not when this lambda returns.
                let lines = seq {
                    match tryOpenText fileExHandler filename with
                    | None -> ()
                    | Some opened ->
                        use reader = opened
                        yield! numberedLines lineExHandler filename reader }
                (filename, lines))
            >> (fun files ->
                returnM files
                |> apply (fun contentpatterns ->
                    let contentRegexes = compile contentpatterns
                    Seq.collect (fun (filename, lines) ->
                        lines
                        |> Seq.collect (fun (content : string, index) ->
                            contentRegexes
                            |> Seq.collect (fun (pattern, regex : Regex) ->
                                regex.Matches(content)
                                |> Seq.cast<Match>
                                |> Seq.map (fun contentmatch ->
                                    (filename,
                                     index,
                                     contentmatch.Index,
                                     pattern,
                                     contentmatch.Value)))))))))

感謝您提供的任何意見。

更新 - 以下是我根據收到的反饋更新後的解決方案:

open System.Text.RegularExpressions 
open System.IO 

/// Settings consumed by `scanner`: the regex patterns to apply and the
/// callbacks used to report the three kinds of I/O failure.
type ScannerConfiguration = { 
    /// Regex patterns; a file is scanned when at least one of them matches its path.
    FileNamePatterns : seq<string> 
    /// Regex patterns searched for in every line of each scanned file.
    ContentPatterns : seq<string> 
    /// Called with the exception and the file path when opening a file fails.
    FileExceptionHandler : exn -> string -> unit 
    /// Called with the exception, the file path and the line index when reading a line fails.
    LineExceptionHandler : exn -> string -> int -> unit 
    /// Called with the exception and the directory path when listing a directory fails.
    DirectoryExceptionHandler : exn -> string -> unit } 

/// Lazily scans every file below each of `specifiedDirectories` whose path matches
/// one of `configuration.FileNamePatterns`, and yields one
/// `(file, lineIndex, columnIndex, pattern, matchedValue)` tuple for every match of
/// a `configuration.ContentPatterns` regex in a line of that file.
///
/// Failures to list a directory, open a file or read a line are reported through
/// the corresponding handler in `configuration`; scanning then continues with the
/// next directory or file.
let scanner specifiedDirectories (configuration : ScannerConfiguration) = seq {
    // Seq.cache keeps compilation lazy but compiles each pattern at most once.
    let toCachedRegexes patterns =
        patterns
        |> Seq.map (fun (pattern : string) -> new Regex(pattern))
        |> Seq.cache

    let contentRegexes = toCachedRegexes configuration.ContentPatterns

    let filenameRegexes = toCachedRegexes configuration.FileNamePatterns

    // Opens `file` for reading; reports the failure and returns None otherwise.
    // (try/with is kept in a plain function rather than in the sequence expression.)
    let tryOpenText (file : string) =
        try
            Some((new FileInfo(file)).OpenText())
        with e ->
            configuration.FileExceptionHandler e file
            None

    // Lazily produces (line, zeroBasedIndex) pairs; a read failure is reported
    // through `exHandler` and ends the sequence.
    let getLines exHandler (reader : StreamReader) =
        Seq.unfold (fun index ->
            try
                if reader.EndOfStream then None
                else Some((reader.ReadLine(), index), index + 1)
            with e ->
                exHandler e index
                None) 0

    for specifiedDirectory in specifiedDirectories do
        let files =
            try Directory.GetFiles(specifiedDirectory, "*", SearchOption.AllDirectories)
            with e -> configuration.DirectoryExceptionHandler e specifiedDirectory; [||]
        for file in files do
            if filenameRegexes |> Seq.exists (fun (regex : Regex) -> regex.IsMatch(file)) then
                match tryOpenText file with
                | None -> ()
                | Some opened ->
                    // `use` directly inside the sequence expression keeps the reader
                    // open while its lines are being consumed and disposes it once
                    // this file has been fully enumerated.
                    use reader = opened
                    let lines =
                        reader
                        |> getLines (fun e index -> configuration.LineExceptionHandler e file index)
                    for (content, index) in lines do
                        for contentregex in contentRegexes do
                            for mmatch in contentregex.Matches(content) do
                                yield (file, index, mmatch.Index, contentregex.ToString(), mmatch.Value) }

同樣,歡迎提出任何意見。

+2

你看過像Parsec這樣的函數式解析器嗎? – 2012-01-10 15:58:39

+1

文字太多了。試着把它拆分成更容易閱讀的段落。 – Marcin 2012-01-10 16:02:22

+0

我會直接使用接口和對象表達式來創建一個實例,並將其暴露給C#代碼。 – 2012-01-10 18:02:47

回答

8

我認爲最好的方法是從最簡單的解決方案開始,然後再對其進行擴展。在我看來,您目前的做法相當難讀,原因有二:

  • 代碼使用了大量的組合子(combinator)和函數組合,而這種模式在F#中並不太常見。其中一些處理可以用序列表達式更容易地編寫。

  • 這段代碼全部寫成單個函數,但它相當複雜,如果它被分成多個函數,它會更具可讀性。

我可能會先把代碼拆分成兩個函數:一個測試單個文件的函數(比如 fileMatches),以及一個遍歷文件並調用 fileMatches 的函數。主要的迭代可以很好地用F#序列表達式來編寫:

// Checks whether a file name matches a filename pattern 
// and a content matches a content pattern.
// The two exception handlers are passed together as a single tuple argument.
let fileMatches fileNamePatterns contentPatterns 
       (fileExHandler, lineExHandler) file = 
    // TODO: This can be implemented using 
    // File.ReadLines which returns a sequence 
    // NOTE(review): no body is shown in this sketch, so the definition is
    // incomplete as written; callers use the result as an `if` condition,
    // so the finished function is presumably expected to return bool.


// Iterates over all the files and calls 'fileMatches'.
// Lazily yields the path of every file for which 'fileMatches' returns true.
// 'dirExHandler' is called with the exception and the directory path when a
// directory cannot be listed; that directory then contributes no files.
let scanner specifiedDirectories fileNamePatterns contentPatterns
            (dirExHandler, fileExHandler, lineExHandler) = seq {
    // Iterate over all the specified directories
    for specifiedDir in specifiedDirectories do
        // Find all files in the directories (and handle exceptions)
        let files =
            try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories)
            with e -> dirExHandler e specifiedDir; [||]
        // Iterate over all files and report those that match
        for file in files do
            if fileMatches fileNamePatterns contentPatterns
                           (fileExHandler, lineExHandler) file then
                // Matches! Return this file as part of the result.
                yield file }

這些函數仍然相當複雜,因爲你需要到處傳遞很多參數。把這些參數包裝在一個簡單的類型或記錄中,可能是個好主意:

/// Bundles the patterns and exception handlers so that they can be passed
/// around as a single argument instead of many separate ones.
type ScannerArguments =
    { /// Pattern applied to file names.
      FileNamePatterns : string
      /// Pattern applied to file contents.
      ContentPatterns : string
      /// Called with the exception and a string when handling a file fails.
      FileExceptionHandler : exn -> string -> unit
      /// Called with the exception and a string when handling a line fails.
      LineExceptionHandler : exn -> string -> unit
      /// Called with the exception and the directory path when listing a directory fails.
      DirectoryExceptionHandler : exn -> string -> unit }

然後你就可以把 fileMatches 和 scanner 都定義爲只接受兩個參數的函數,這會讓你的代碼可讀性高很多。例如:

// Iterates over all the files and calls 'fileMatches'.
// Lazily yields the path of every file for which 'fileMatches args file' is true.
// A directory that cannot be listed is reported through
// 'args.DirectoryExceptionHandler' and contributes no files.
let scanner specifiedDirectories (args:ScannerArguments) = seq {
    for specifiedDir in specifiedDirectories do
        let files =
            try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories)
            with e -> args.DirectoryExceptionHandler e specifiedDir; [||]
        for file in files do
            // No need to propagate all arguments explicitly to other functions
            if fileMatches args file then yield file }