2016-04-28 52 views
0

We have a set of 5000 KYC documents in txt format. I need to perform NER on them and summarize the results as a table in a Shiny app that can be used on our website. However, parsing the documents takes too long, more than 30 minutes :( Optimization is needed. Can anybody suggest some approaches I should implement? The text files are in the following format:

    Name - XYZ
    Father's Name - ABC
    Address - Mall Road, Shimla, India
    Marital Status - Married
    Annual Income - Rs 750000
    Employed - Yes
    Guaranter - Mr HD
    Credit Analyst Comments - XYZ has no credit history, may be an NPA in the future ............. and other details
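
Since each record is a series of "Field - Value" lines, a single line can be split on the first " - " separator. A minimal sketch in R (my illustration, not part of the original post; `parse_kyc_line` is a hypothetical helper):

    # split one "Field - Value" line into a named value
    parse_kyc_line <- function(line) {
        pos <- regexpr(" - ", line, fixed = TRUE)  # first separator only
        if (pos == -1) return(NULL)
        setNames(substring(line, pos + 3), substring(line, 1, pos - 1))
    }
    parse_kyc_line("Annual Income - Rs 750000")
    #> Annual Income
    #>   "Rs 750000"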

Steps involved: 1. Preprocess the file name (remove numbers and spaces, since each document is uploaded as a PDF and then converted to text).

  2. Create an index of all the columns (Name, Father's Name, Address, Marital Status, Annual Income, Age, Credit Analyst Comments).

  3. A function that parses each file. It uses named entity recognition and other techniques to pick out the keywords, ignore the other words, and map them to the corresponding columns. The function is named parseAKYC(file).

  4. Call this function inside another function, parseAllKYC.

  5. The function parseAllKYC(files_path) takes too long to finish when there is a large batch of files; with six files it gives me results within seconds. I want to use the parallel package. Can anyone help me? Most of the examples I have found are trivial ones. Can we use the parallel package to run my function parseAllKYC in parallel?

  6. The code for the final function parseAllKYC is shown below.


    #code for parallel parsing
    library(foreach)
    library(iterators)
    library(doParallel)
    fileloc <- "location of 5000 KYC files"
    # match only .txt files and build the full paths in one step
    filelist <- list.files(path = fileloc, pattern = "\\.txt$")
    files <- file.path(fileloc, filelist)
    no_cores <- detectCores() - 1
    cl <- makeCluster(no_cores)
    registerDoParallel(cl)
    # .packages loads the NLP stack on every worker; without it the workers
    # cannot find as.String and the other annotator functions
    KYCTable <- foreach(i = iter(files), .combine = rbind,
                        .packages = c("NLP", "openNLP", "openNLPmodels.en", "tm")) %dopar%
    {
        parseAKYC(i)
    }
    stopCluster(cl) 
    
    #code for parseAKYC function 
    library(NLP)
    library(openNLP)
    library(openNLPmodels.en)  # English entity models for openNLP
    library(tm)
    library(DT)
    
    preprocessFile <- function(file) {
        file <- file[!duplicated(file)]     # drop duplicated lines
        file <- gsub("\\f", "", file)       # strip form-feed characters
        file <- gsub('""', "", file)        # strip doubled quotes
        file <- gsub("Page\\d+", "", file)  # strip page-number markers
        file <- gsub("-+", "", file)        # strip runs of dashes
        file <- file[file != ""]            # drop lines that are now empty
        return (file)
    }
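    # illustrative check (toy input, my addition -- not from the real files):
    # preprocessFile(c("Name", "Name", "Page1", "", "XYZ"))
    # returns c("Name", "XYZ"): the duplicate, the page marker and the
    # blank lines are all stripped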
    extract_People_Location_Org <- function(file) { 
        file <- lapply(file, removePunctuation) 
        file <- unlist(file) 
        s <- as.String(file) 
        sent_token_annotator <- Maxent_Sent_Token_Annotator() 
        gc() 
        word_token_annotator <- Maxent_Word_Token_Annotator() 
        a2 <- annotate(s, list(sent_token_annotator, word_token_annotator)) 
    
        ## Entity recognition for people's names.
        entity_annotator_people <- Maxent_Entity_Annotator()
        ann_people <- entity_annotator_people(s, a2)  # run the annotator once and reuse the result
        if (length(ann_people) == 0) {
            people_name <- ""
        } else {
            people_name <- s[ann_people]
        }
        people_name <- unique(people_name)
        result1 <- paste(people_name, collapse = ", ")
    
        ## Entity recognition for locations.
        entity_annotator_location <- Maxent_Entity_Annotator(kind = "location")
        ann_location <- entity_annotator_location(s, a2)  # run once and reuse
        if (length(ann_location) == 0) {
            location <- ""
        } else {
            location <- s[ann_location]
        }
        location <- unique(location)
        result2 <- paste(location, collapse = ", ")
    
        ## Entity recognition for organizations.
        entity_annotator_org <- Maxent_Entity_Annotator(kind = "organization")
        ann_org <- entity_annotator_org(s, a2)  # run once and reuse
        if (length(ann_org) == 0) {
            org <- ""
        } else {
            org <- s[ann_org]
        }
        org <- unique(org)
        result3 <- paste(org, collapse = ", ")
        return (c(result1, result2, result3)) 
    } 
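    # Performance note (my suggestion, not in the original post): the
    # Maxent_*_Annotator() calls above reload the openNLP model files on
    # every invocation of extract_People_Location_Org, i.e. once per
    # document. A cheap optimization is to build them a single time per
    # worker and reuse them, e.g.:
    #
    #   annotators <- list(
    #       sent   = Maxent_Sent_Token_Annotator(),
    #       word   = Maxent_Word_Token_Annotator(),
    #       person = Maxent_Entity_Annotator(kind = "person"),
    #       loc    = Maxent_Entity_Annotator(kind = "location"),
    #       org    = Maxent_Entity_Annotator(kind = "organization")
    #   )
    #
    # and pass `annotators` into extract_People_Location_Org instead of
    # recreating them inside the function.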
    extractCreditAnalystComments <- function(file) {
        index <- makeIndex(file)
        # code 8 marks the CreditAnalystComments block (see makeIndex below);
        # the original used code 6, which makeIndex assigns to "Employed"
        hits <- which(index == 8)
        if (length(hits) == 0) return ("")
        return (paste(file[(hits[1] + 1):tail(hits, 1)], collapse = ", "))
    }
    makeIndex <- function(file) {
        # create a vector holding the field code of each line
        # CODE: 1-Name 2-Address 4-Marital Status 5-Annual Income
        #       6-Employed 7-Guaranter 8-CreditAnalystComments
        #       9-Interests 10-Credit History

        index <- rep(0, length(file))
        index[which(file == "Name")] <- 1
        index[which(file == "Address")] <- 2
        # index[which(grepl("@", file) == T)] <- 3
        index[which(file == "Marital Status")] <- 4
        index[which(file == "Annual Income")] <- 5
        index[which(file == "Employed")] <- 6
        index[which(file == "Guaranter")] <- 7
        index[which(file == "CreditAnalystComments")] <- 8
        index[which(file == "Interests")] <- 9
        index[which(file == "Credit History")] <- 10

        # carry each field code forward until the next header line
        for (i in 1:(length(index) - 1)) {
            if (index[i + 1] == 0) {
                index[i + 1] <- index[i]
            }
        }
        return (index)
    }
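    # illustrative example (toy input, my addition): for
    #   c("Name", "XYZ", "Address", "Mall Road", "Shimla")
    # makeIndex returns c(1, 1, 2, 2, 2): each header line gets its code,
    # and the code is carried forward over the lines that follow it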
    parseAKYC <- function(file_name) { 
        # input: a KYC in format *.txt 
    
    
        # read file text 
        file <- readLines(file_name, warn = F) 
    
        # preprocessing file 
        file <- preprocessFile(file) 
    
        # container for the parsed fields (the original built an empty list
        # via as.list(c(...)), which is equivalent)
        KYC <- list()
        KYC$Name <- file[1] 
        KYC$CreditAnalystComments <- extractCreditAnalystComments(file) 
        x <- extract_People_Location_Org(file) 
        # ------------------------------------------------------------- 
    
        CreditAnalystComments.split <- unlist(strsplit(KYC$CreditAnalystComments, split = ","))
        CreditAnalystComments.split <- gsub("^\\s+", "", CreditAnalystComments.split)
        Employed.split <- unlist(strsplit(x[3], split = ","))
        Employed.split <- gsub("^\\s+", "", Employed.split)
        # setdiff also handles the no-overlap case, which the original
        # negative indexing x[-which(...)] got wrong
        Employed_not_in_CreditAnalystComments <- setdiff(Employed.split, CreditAnalystComments.split)
        Employed <- paste0(Employed_not_in_CreditAnalystComments, collapse = ", ")
        # ------------------------------------------------------------- 
    
        # ------------------------------------------------------------- 
    
        Guaranter.split <- unlist(strsplit(x[1], split = ","))
        Guaranter.split <- gsub("^\\s+", "", Guaranter.split)
        Guaranter_not_in_CreditAnalystComments <- setdiff(Guaranter.split, CreditAnalystComments.split)
        Guaranter <- paste0(Guaranter_not_in_CreditAnalystComments, collapse = ", ")
        # ------------------------------------------------------------- 
    
        KYC$Employed <- Employed 
        # remember to change the Java heap size to at least 2GB
        KYC$Address <- x[2] 
        #KYC$Designation <- file[2] 
        KYC$Guaranter <- Guaranter 
        return (as.data.frame(KYC, stringsAsFactors = F)) 
    } 
    parseAllKYC <- function(files_path) {
        KYC.df <- data.frame(Name = character(), FatherName = character(),
                             Address = character(), maritalstatus = character(),
                             Annualincome = character(),
                             CreditAnalystComments = character(),
                             stringsAsFactors = FALSE)

        for (i in files_path) {
            KYC <- parseAKYC(i)
            KYC.df <- rbind(KYC.df, KYC)
        }
        return (KYC.df)
    }
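    # Alternative sketch (my suggestion, not from the original post): growing
    # a data frame with rbind inside a loop re-copies all rows on every
    # iteration; for 5000 files it is much faster to build the rows first
    # and bind once
    parseAllKYC2 <- function(files_path) {   # hypothetical name
        do.call(rbind, lapply(files_path, parseAKYC))
    }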
    
    
    
    
    
    #ui.R

    fluidPage(fluidRow(column(12, DT::dataTableOutput('tbl'))))
    
    
    
    
    #server.R
    library(shiny)
    library(DT)
    source("getKYCTable.R")  # expected to create the KYCTable data frame
    function(input, output, session) {
        output$tbl <- DT::renderDataTable(KYCTable, filter = 'top',
                                          options = list(lengthChange = FALSE))
    }
    
    
    
    
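Because parsing 5000 files is expensive, one practical pattern (my suggestion; `getKYCTable.R` comes from the post, `KYCTable.rds` is an assumed file name) is to run the parser once offline, persist the table, and let the Shiny app only load the saved result:

    # run once, outside the app
    KYCTable <- parseAllKYC(files)
    saveRDS(KYCTable, "KYCTable.rds")

    # getKYCTable.R -- sourced by server.R, just reloads the saved table
    KYCTable <- readRDS("KYCTable.rds")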
    
+0

Do you want to run the code on multiple cores? – TUSHAr

+0

Yes. It throws an error. Here is the code used. –

+0

Can you make sure your code (parseAllKYC) is indented correctly? Also, I don't see parseAResume in the parseAllKYC function. – TUSHAr

Answer

0

Try this: while I think there may well be optimization issues inside your parseAKYC function, to get you going you can parallelise in the following way. This setup is for Ubuntu. For Windows, you can look at the doSNOW package.

Use the following packages:

require(doParallel)
require(foreach)
require(doMC)   # for Ubuntu/Linux
require(doSNOW) # for Windows

# set the value based on the number of cores on your machine
registerDoMC(cores = 4)

# assuming files_path is a character vector of complete file paths
parsed.output <- foreach(i = 1:length(files_path)) %dopar% parseAKYC(files_path[i])
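
For Windows, a rough doSNOW equivalent would look like the sketch below (same assumption that files_path holds complete file paths; the core count is illustrative):

    library(doSNOW)
    library(foreach)
    cl <- makeCluster(4)  # explicit cluster instead of forked workers
    registerDoSNOW(cl)
    # .packages loads the NLP stack on each worker, which also avoids the
    # 'could not find function "as.String"' error mentioned in the comments
    parsed.output <- foreach(i = 1:length(files_path),
                             .packages = c("NLP", "openNLP", "openNLPmodels.en", "tm")) %dopar%
        parseAKYC(files_path[i])
    stopCluster(cl)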
+0

I have added the parseAKYC function. I implemented the parallel code, but it throws an error in the function "extract_People_Location_Org": could not find function "as.String" –

+0

You are using the as.String function, third line: s <- as.String(file). I think you should use as.character(); as.String is a function in NLP. – TUSHAr

+0

I converted it with as.character. It then breaks many functions like lapply(file, removePunctuation) and unlist(file), and I cannot remove those. Can you suggest some good packages suited to parallel NLP processing? Thanks in advance. You have been a great help. –