2016-09-24 54 views
0

我從XSLX產生用Groovy一個ARFF文件, 沒有宣佈,但是當我嘗試秧雞打開這個文件我得到這個錯誤:標稱值在頭

File "..." not recognised as an 'Arff data files' file. Reason: nominal value not declared in header, read Token[Ativo], line 16

我無法理解爲什麼我得到這個錯誤 有人可以幫助我解決這個錯誤,並解釋它爲什麼會發生?

生成的文件

@relation kd-itempedido 
@attribute tipopedido {Assistencia,Recompra,Venda,Troca} 
@attribute aprovado {0.0,1.0} 
@attribute fasepedido {Aprovado,Cancelado,EmAprovacao,Liberado,Novo} 
@attribute statusinternopedido {NegociarPagamento,PedidosDeTeste,AguardandoOcorrencia,Nada,AguardandoBoletoDeposito,PedidoDuplicado,SuspeitaDeFraude} 
@attribute canal {Marketplace,Desktop} 
@attribute origem {LojasAmericanas,Optimise,MercadoLivre,Cityads,Zanox,Zoom,Rakuten,Lomadee,Facebook,Viptarget,Submarino,Criteo,Muccashop,Chaordic,Walmart,Googlead,Nada,Extra,Lojaskd,Shopback,Afilio,Shoptime,Nextperformance,CarrinhoAbandonado,Bing} 
@attribute mercado {S,N} 
@attribute cluster {EntregaImediata,Fiprec,Icconv,Esgotado} 
@attribute statusitem {Ativo} 
@attribute statusproduto {Inativo,Ativo,AtivoSemEstoque,ForaDeLinha} 
@attribute polo {Polo1,Polo3,Polo2} 
@data 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Ativo,Inativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Muccashop,N,Ativo,Ativo,Polo3 

Groovy的(VM -Dfile.encoding = ASCII UTF8 UTF8)

@Grapes([ 
     @Grab('org.apache.poi:poi:3.10.1'), 
     @Grab('org.apache.poi:poi-ooxml:3.10.1')]) 
import org.apache.poi.xssf.usermodel.XSSFWorkbook 
import java.text.Normalizer 
import static org.apache.poi.ss.usermodel.Cell.* 
import java.nio.file.Paths 

def path = "/home/eric/Documents/development/ufpr/Solid Eric/ItemPedido1000.xlsx" 
def relation = "kd-itempedido" 
def columns = ["tipopedido", "aprovado", "fasepedido", "statusinternopedido", "canal", "origem", "mercado", "cluster", "statusitem","statusproduto", "polo"] 
def arff = "ItemPedido.arff" 
new XslxToArffParser(path, relation, columns, arff); 

class Data{ 
    def rows = new ArrayList<List>(); 

    @Override 
    String toString() { 
     def s = "" 
     for (r in rows){ 
      for(d in r){ 

       s+=d 
       if(r.indexOf(d) < (r.size()-1)) 
        s+="," 
      } 
      s+="\n" 
     } 
     return s 
    } 
} 



class Atributo { 
    def descricao; 
    def possibilidades = new HashSet<Object>(); 
    def index; 

    @Override 
    String toString() { 

     def builder = new StringBuilder() 
     builder.append("@attribute ").append(descricao) 
     builder.append(" {") 
     for(def i = 0; i<possibilidades.size(); i++){ 
      builder.append(possibilidades[i]) 
      if((i+1) != possibilidades.size()) 
       builder.append(",") 
     } 
     builder.append("}").append("\n") 
     return builder.toString(); 
    } 
} 

class XslxToArffParser { 
    def attributes =[:]; 
    def data = new Data(); 
    def sheet = null; 

    XslxToArffParser(path, relation, columns, arffPath){ 
     load(path) 
     getAttributes(columns) 
     collectData() 
     saveArff(relation, arffPath) 
    } 

    def String parse(String s){ 
     s = Normalizer.normalize(s, Normalizer.Form.NFD) 
     s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}]", "") 
     s = s.split(/[^\w]/).collect { it.toLowerCase().capitalize() }.join("") 
     s = s.replaceAll(" ", "") 
     s = s.replaceAll("[^A-Za-z0-9]", "") 
     s = s.isEmpty() ? "Nada" : s 
     return s 
    } 

    def load(path) { 
     Paths.get(path).withInputStream { input -> 
      def workbook = new XSSFWorkbook(input) 
      sheet = workbook.getSheetAt(0) 
     } 
    } 

    def getAttributes(columns){ 
     for (cell in sheet.getRow(0).cellIterator()) { 
      def index = cell.columnIndex 
      def description = parse(cell.stringCellValue).toLowerCase() 
      if(columns.contains(description)){ 
       attributes << [(index):new Atributo(descricao: description, index: index)] 
      } 
     } 
    } 

    def collectData(){ 
     def headerFlag = true 
     for (row in sheet.rowIterator()) { 
      if (headerFlag) { 
       headerFlag = false 
       continue 
      } 
      def r = [] 
      for (cell in row.cellIterator()) { 
       def index = cell.columnIndex; 
       def value = cell.cellType == CELL_TYPE_STRING ? parse(cell.stringCellValue) : cell.numericCellValue 

       def attr = attributes[index] 
       if(attr != null){ 
        attr.possibilidades.add(value) 
        r << value 
       } 
      } 

      data.rows.add(r) 
     } 
    } 

    def saveArff(relation, path){ 
     Paths.get(path).withWriter { writer -> 

      writer.write "@relation " + relation 
      writer.write "\n" 
      for(a in attributes.values()) 
       writer.write a.toString() 

      writer.write "@data" 
      writer.write "\n" 

      writer.write data.toString() 
     } 
    } 
} 

解決。 「row.cellIterator()」不會迭代空/空單元格

回答

0

自從我使用Weka以來,我一直在觀察你顯示的文件和錯誤消息,我懷疑問題出現在最後兩個數據文件的行。它們沒有屬性「集羣」的值。

之後的S或N(屬性「mercado」),他們有「Ativo」。該「Ativo」值未被定義爲標稱屬性集羣的可能值之一。雖然該文件讀取了「Ativo」(這就是爲什麼錯誤消息顯示爲''讀取Token [Ativo]'',但它期望爲羣集屬性讀取一個值,但它尚未期望statusitem屬性的值。