2017-02-15 181 views
2

當使用下面的工具將大型excel文件轉換爲csv時,由於Excel格式定義爲*格式,某些日期值轉換不正確。Java:excel與csv日期轉換Apache Poi的問題

代碼位置:https://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/xssf/eventusermodel/XLSX2CSV.java

/* ==================================================================== 
    Licensed to the Apache Software Foundation (ASF) under one or more 
    contributor license agreements. See the NOTICE file distributed with 
    this work for additional information regarding copyright ownership. 
    The ASF licenses this file to You under the Apache License, Version 2.0 
    (the "License"); you may not use this file except in compliance with 
    the License. You may obtain a copy of the License at 

     http://www.apache.org/licenses/LICENSE-2.0 

    Unless required by applicable law or agreed to in writing, software 
    distributed under the License is distributed on an "AS IS" BASIS, 
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    See the License for the specific language governing permissions and 
    limitations under the License. 
==================================================================== */ 

package org.apache.poi.xssf.eventusermodel; 

import java.io.File; 
import java.io.IOException; 
import java.io.InputStream; 
import java.io.PrintStream; 

import javax.xml.parsers.ParserConfigurationException; 

import org.apache.poi.openxml4j.exceptions.OpenXML4JException; 
import org.apache.poi.openxml4j.opc.OPCPackage; 
import org.apache.poi.openxml4j.opc.PackageAccess; 
import org.apache.poi.ss.usermodel.DataFormatter; 
import org.apache.poi.ss.util.CellAddress; 
import org.apache.poi.ss.util.CellReference; 
import org.apache.poi.util.SAXHelper; 
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; 
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; 
import org.apache.poi.xssf.model.StylesTable; 
import org.apache.poi.xssf.usermodel.XSSFComment; 
import org.xml.sax.ContentHandler; 
import org.xml.sax.InputSource; 
import org.xml.sax.SAXException; 
import org.xml.sax.XMLReader; 

/** 
* A rudimentary XLSX -> CSV processor modeled on the 
* POI sample program XLS2CSVmra from the package 
* org.apache.poi.hssf.eventusermodel.examples. 
* As with the HSSF version, this tries to spot missing 
* rows and cells, and output empty entries for them. 
* <p/> 
* Data sheets are read using a SAX parser to keep the 
* memory footprint relatively small, so this should be 
* able to read enormous workbooks. The styles table and 
* the shared-string table must be kept in memory. The 
* standard POI styles table class is used, but a custom 
* (read-only) class is used for the shared string table 
* because the standard POI SharedStringsTable grows very 
* quickly with the number of unique strings. 
* <p/> 
* For a more advanced implementation of SAX event parsing 
* of XLSX files, see {@link XSSFEventBasedExcelExtractor} 
* and {@link XSSFSheetXMLHandler}. Note that for many cases, 
* it may be possible to simply use those with a custom 
* {@link SheetContentsHandler} and no SAX code needed of 
* your own! 
*/ 
public class XLSX2CSV { 
    /** 
    * Uses the XSSF Event SAX helpers to do most of the work 
    * of parsing the Sheet XML, and outputs the contents 
    * as a (basic) CSV. 
    */ 
    private class SheetToCSV implements SheetContentsHandler { 
     private boolean firstCellOfRow = false; 
     private int currentRow = -1; 
     private int currentCol = -1; 

     private void outputMissingRows(int number) { 
      for (int i=0; i<number; i++) { 
       for (int j=0; j<minColumns; j++) { 
        output.append(','); 
       } 
       output.append('\n'); 
      } 
     } 

     @Override 
     public void startRow(int rowNum) { 
      // If there were gaps, output the missing rows 
      outputMissingRows(rowNum-currentRow-1); 
      // Prepare for this row 
      firstCellOfRow = true; 
      currentRow = rowNum; 
      currentCol = -1; 
     } 

     @Override 
     public void endRow(int rowNum) { 
      // Ensure the minimum number of columns 
      for (int i=currentCol; i<minColumns; i++) { 
       output.append(','); 
      } 
      output.append('\n'); 
     } 

     @Override 
     public void cell(String cellReference, String formattedValue, 
       XSSFComment comment) { 
      if (firstCellOfRow) { 
       firstCellOfRow = false; 
      } else { 
       output.append(','); 
      } 

      // gracefully handle missing CellRef here in a similar way as XSSFCell does 
      if(cellReference == null) { 
       cellReference = new CellAddress(currentRow, currentCol).formatAsString(); 
      } 

      // Did we miss any cells? 
      int thisCol = (new CellReference(cellReference)).getCol(); 
      int missedCols = thisCol - currentCol - 1; 
      for (int i=0; i<missedCols; i++) { 
       output.append(','); 
      } 
      currentCol = thisCol; 

      // Number or string? 
      try { 
       //noinspection ResultOfMethodCallIgnored 
       Double.parseDouble(formattedValue); 
       output.append(formattedValue); 
      } catch (NumberFormatException e) { 
       output.append('"'); 
       output.append(formattedValue); 
       output.append('"'); 
      } 
     } 

     @Override 
     public void headerFooter(String text, boolean isHeader, String tagName) { 
      // Skip, no headers or footers in CSV 
     } 
    } 


    /////////////////////////////////////// 

    private final OPCPackage xlsxPackage; 

    /** 
    * Number of columns to read starting with leftmost 
    */ 
    private final int minColumns; 

    /** 
    * Destination for data 
    */ 
    private final PrintStream output; 

    /** 
    * Creates a new XLSX -> CSV converter 
    * 
    * @param pkg  The XLSX package to process 
    * @param output  The PrintStream to output the CSV to 
    * @param minColumns The minimum number of columns to output, or -1 for no minimum 
    */ 
    public XLSX2CSV(OPCPackage pkg, PrintStream output, int minColumns) { 
     this.xlsxPackage = pkg; 
     this.output = output; 
     this.minColumns = minColumns; 
    } 

    /** 
    * Parses and shows the content of one sheet 
    * using the specified styles and shared-strings tables. 
    * 
    * @param styles The table of styles that may be referenced by cells in the sheet 
    * @param strings The table of strings that may be referenced by cells in the sheet 
    * @param sheetInputStream The stream to read the sheet-data from. 

    * @exception java.io.IOException An IO exception from the parser, 
    *   possibly from a byte stream or character stream 
    *   supplied by the application. 
    * @throws SAXException if parsing the XML data fails. 
    */ 
    public void processSheet(
      StylesTable styles, 
      ReadOnlySharedStringsTable strings, 
      SheetContentsHandler sheetHandler, 
      InputStream sheetInputStream) throws IOException, SAXException { 
     DataFormatter formatter = new DataFormatter(); 
     InputSource sheetSource = new InputSource(sheetInputStream); 
     try { 
      XMLReader sheetParser = SAXHelper.newXMLReader(); 
      ContentHandler handler = new XSSFSheetXMLHandler(
        styles, null, strings, sheetHandler, formatter, false); 
      sheetParser.setContentHandler(handler); 
      sheetParser.parse(sheetSource); 
     } catch(ParserConfigurationException e) { 
      throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); 
     } 
    } 

    /** 
    * Initiates the processing of the XLS workbook file to CSV. 
    * 
    * @throws IOException If reading the data from the package fails. 
    * @throws SAXException if parsing the XML data fails. 
    */ 
    public void process() throws IOException, OpenXML4JException, SAXException { 
     ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(this.xlsxPackage); 
     XSSFReader xssfReader = new XSSFReader(this.xlsxPackage); 
     StylesTable styles = xssfReader.getStylesTable(); 
     XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); 
     int index = 0; 
     while (iter.hasNext()) { 
      InputStream stream = iter.next(); 
      String sheetName = iter.getSheetName(); 
      this.output.println(); 
      this.output.println(sheetName + " [index=" + index + "]:"); 
      processSheet(styles, strings, new SheetToCSV(), stream); 
      stream.close(); 
      ++index; 
     } 
    } 

    public static void main(String[] args) throws Exception { 
     if (args.length < 1) { 
      System.err.println("Use:"); 
      System.err.println(" XLSX2CSV <xlsx file> [min columns]"); 
      return; 
     } 

     File xlsxFile = new File(args[0]); 
     if (!xlsxFile.exists()) { 
      System.err.println("Not found or not a file: " + xlsxFile.getPath()); 
      return; 
     } 

     int minColumns = -1; 
     if (args.length >= 2) 
      minColumns = Integer.parseInt(args[1]); 

     // The package open is instantaneous, as it should be. 
     OPCPackage p = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ); 
     XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns); 
     xlsx2csv.process(); 
     p.close(); 
    } 
} 

Input format Image - Click here to see the formatting for dates

輸入:

date1  date2 
1/1/1900 1/1/1900 
2/28/2012 2/28/2012 
3/15/1965 3/15/1965 
1/1/2000 1/1/2000 
1/1/2100 1/1/2100 
1/1/2115 1/1/2115 

輸出:

date1  date2 
1/1/2000 1/1/1900 
2/28/2012 2/28/2012 
3/15/1965 3/15/1965 
1/1/2000 1/1/2000 
1/1/2000 1/1/2100 
1/1/2015 1/1/2115 

如果你看看輸入數據, date1列)格式化爲使用區域設置的星號,具有該格式的單元格會受到影響,因爲1900會轉換爲2000,所以2099以上的任何數據...如果單元格(即Date2列)格式化爲不帶*,則值爲按預期出來。這是該實用程序的限制還是有解決方法?

如果我以另一種方式格式化單元而不應用區域設置格式,我會得到正確的輸出。

回答

1

無法使用apache poi 3.15 final來重現該行爲。

已顯示的代碼產生以下輸出:

Sheet1 [index=0]: 
"date1","date2" 
"1/1/00","1/1/1900" 
"2/28/12","2/28/2012" 
"3/15/65","3/15/1965" 
"1/1/00","1/1/2000" 
"1/1/00","1/1/2100" 
"1/1/15","1/1/2115" 

所以對於默認日期格式m/d/yy被使用(格式-ID = 0xe =短= * 3/14/2012年的日期)。這是在BuiltinFormats中定義的那個。

當然,如果你把這個輸出直接放到一個CSV文件中,那麼目前還沒有一個世紀,Excel在打開這個CSV文件時必須猜測這個世紀。

你可以改變代碼一點點,像:

... 
    public void processSheet(
      StylesTable styles, 
      ReadOnlySharedStringsTable strings, 
      SheetContentsHandler sheetHandler, 
      InputStream sheetInputStream) throws IOException, SAXException { 
     DataFormatter formatter = new DataFormatter() { 
      @Override 
      public String formatRawCellContents(double value, int formatIndex, String formatString, boolean use1904Windowing) { 
       if ("m/d/yy".equals(formatString)) formatString = "m/d/yyyy"; 
       return super.formatRawCellContents(value, formatIndex, formatString, use1904Windowing); 
      } 
     }; 
     InputSource sheetSource = new InputSource(sheetInputStream); 
     try { 
      XMLReader sheetParser = SAXHelper.newXMLReader(); 
      ContentHandler handler = new XSSFSheetXMLHandler(
        styles, null, strings, sheetHandler, formatter, false); 
      sheetParser.setContentHandler(handler); 
      sheetParser.parse(sheetSource); 
     } catch(ParserConfigurationException e) { 
      throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); 
     } 
    } 
... 

這將產生以下輸出:

Sheet1 [index=0]: 
"date1","date2" 
"1/1/1900","1/1/1900" 
"2/28/2012","2/28/2012" 
"3/15/1965","3/15/1965" 
"1/1/2000","1/1/2000" 
"1/1/2100","1/1/2100" 
"1/1/2115","1/1/2115" 
+0

謝謝阿克塞爾,這看起來乾淨的做法。我弄明白了在XMLSheet處理程序中的格式字符串作爲m/d/yy,我改爲m/d/yyyy ...但不使用自定義xml處理程序,最好使用此做法。我會測試這個並讓你知道。謝謝.. – Dvusrgme