/** * '$RCSfile: DelimitedReader.java,v $' * * '$Author: leinfelder $' * '$Date: 2008-02-29 23:23:36 $' * '$Revision: 1.9 $' * * For Details: http://kepler.ecoinformatics.org * * Copyright (c) 2003 The Regents of the University of California. * All rights reserved. * * Permission is hereby granted, without written agreement and without * license or royalty fees, to use, copy, modify, and distribute this * software and its documentation for any purpose, provided that the * above copyright notice and the following two paragraphs appear in * all copies of this software. * * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN * IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE * PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY * OF CALIFORNIA HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, * UPDATES, ENHANCEMENTS, OR MODIFICATIONS. */ package org.ecoinformatics.datamanager.database; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.TreeSet; import java.util.Vector; import org.ecoinformatics.datamanager.parser.Entity; import org.ecoinformatics.datamanager.quality.QualityCheck; import org.ecoinformatics.datamanager.quality.QualityCheck.Status; import org.ecoinformatics.datamanager.quality.QualityReport; /** * tokenizes a delimited file. This reader assumes that one record is on one * line which ends with the line */ public class DelimitedReader extends TextDataReader { /* * Instance fields */ private String data; private InputStreamReader dataReader; private Vector[] lines; private Vector linesVector; private int numHeaderLines; private int numRecords; private boolean stripHeader = false; private int numCols; private String fieldDelimiter; private String lineEnding; private boolean collapseDelimiters = false; private int numFooterLines = 0; private Vector footerBuffer = new Vector(); private boolean initializedFooterBuffer = false; private int headLineNumberCount = 0; private String quoteCharacter = null; private String literalCharacter = null; private boolean includeLiteralCharacter = false; private Entity entity; private int rowCounter = 0; // Used for quality reporting purposes private int tooFewFieldsCounter = 0; // Counts 'tooFewFields' errors private int tooManyFieldsCounter = 0; // Counts 'tooManyFields' errors private final int FIELD_CHECK_MAX = 5; // Max number of field count checks to report private int examineRecordDelimiterCounter = 0; // Counts 'examineRecordDelimiter' checks private final int EXAMINE_RECORD_DELIMITER_MAX = 1; // Max number of examineRecordDelimiter checks private boolean hasRecordDelimiter = false; // Set to true when the record delimiter is found in the data table /*private static Log log; static { log = LogFactory.getLog("org.ecoinformatics.util.DelimitedReader"); }*/ /* * Constructors */ /** * Constructor. Reads the csv (comma-separated values) stream. * * @param data the delimited stream to read as a string * @param numCols the number of columns in the stream * @param delimiter the delimiter string to tokenize on * @param numHeaderLines the number of lines to skip at the top of the file * @param lineEnding the line ending char(s)...either "\n" (Unix), * "\r\n" (Windows) or "\r" (Mac) * @param numRecords the number of rows in the data string */ public DelimitedReader(String data, int numCols, String delimiter, int numHeaderLines, String lineEnding, int numRecords) throws Exception { this.numHeaderLines = numHeaderLines; this.data = data; this.numCols = numCols; this.numRecords = numRecords; //log.debug("Delimiter is: " + delimiter); this.fieldDelimiter = unescapeDelimiter(delimiter); //log.debug("LineEnding is: " + lineEnding); this.lineEnding = unescapeDelimiter(lineEnding); //lines = new Vector[numRecords + numHeaderLines + 1]; linesVector = new Vector(); int begin = 0; int end = 0; // int i = 0; while(end < data.length()) { //add each line of the string as an element in a vector end = data.indexOf(this.lineEnding, begin); //DFH 'this.' added if(end == -1) { end = data.length(); } String line = data.substring(begin, end); if(!line.trim().equals("")) { //take off the line ending // MBJ: I commented out the next line as it was improperly truncating // lines. I'm not sure why it was there in the first place, as the // previous substring removed the delimiter //line = line.substring(0, line.length() - lineEnding.length()); //split the line based on the delimiter Vector v = splitDelimitedRowStringIntoVector(line); /*String[] s = line.split(delimiter.trim(), numCols); Vector v = new Vector(); for(int j=0; j */ public Vector getOneRowDataVector() throws Exception { rowCounter++; if (!initializedFooterBuffer) { for (int i = 0; i < numFooterLines; i++) { String rowData = readOneRowDataString(); footerBuffer.add(rowData); } // this is for no footer lines if (numFooterLines == 0) { String rowData = readOneRowDataString(); footerBuffer.add(rowData); } initializedFooterBuffer = true; } String nextRowData = readOneRowDataString(); String oneRowDataString = null; Vector oneRowDataVector = new Vector(); if (nextRowData != null) { oneRowDataString = (String) footerBuffer.remove(0); reIndexFooterBufferVector(); footerBuffer.add(nextRowData); } else if (numFooterLines == 0 && !footerBuffer.isEmpty()) { oneRowDataString = (String)footerBuffer.remove(0); } if (oneRowDataString != null) { /* * Quality check: 'examineRecordDelimiter' */ if (examineRecordDelimiterCounter < EXAMINE_RECORD_DELIMITER_MAX) { /* * If no valid record delimiter is specified in metadata, first row of * data is examined and a potential delimiter displayed. */ String examineRecordDelimiter = "examineRecordDelimiter"; QualityCheck examineRecordDelimiterTemplate = QualityReport .getQualityCheckTemplate(examineRecordDelimiter); QualityCheck examineRecordDelimiterQualityCheck = new QualityCheck( examineRecordDelimiter, examineRecordDelimiterTemplate); if (QualityCheck.shouldRunQualityCheck(entity, examineRecordDelimiterQualityCheck)) { String found = null; String metadataRecordDelimiter = entity.getMetadataRecordDelimiter(); /* * If metadata didn't specify a valid record delimiter, check * whether other potential candidates can be identified. */ ArrayList otherDelimiters = otherRecordDelimiters(oneRowDataString, metadataRecordDelimiter); boolean hasSuggestedDelimiter = entity.isSuggestedRecordDelimiter(metadataRecordDelimiter); if (otherDelimiters.size() > 0) { found = "Other potential record delimiters were found in the first row: "; found += otherDelimiters.toString(); examineRecordDelimiterQualityCheck.setFailedStatus(); } else { found = "No other potential record delimiters were detected."; if (hasSuggestedDelimiter) { found += " A valid record delimiter was previously detected"; examineRecordDelimiterQualityCheck.setStatus(Status.valid); examineRecordDelimiterQualityCheck.setExplanation(""); examineRecordDelimiterQualityCheck.setSuggestion(""); } else { examineRecordDelimiterQualityCheck.setFailedStatus(); } } examineRecordDelimiterQualityCheck.setFound(found); entity.addQualityCheck(examineRecordDelimiterQualityCheck); } examineRecordDelimiterCounter++; } oneRowDataVector = splitDelimitedRowStringIntoVector(oneRowDataString); } return oneRowDataVector; } /* * Used in quality reporting for the 'examineRecordDelimiter' quality check. * Check whether a row of data contains other potential record delimiters * besides the record delimiter specified in the metadata. */ private ArrayList otherRecordDelimiters(String row, String metadataDelimiter) { boolean foundTwoCharacterDelimiter = false; ArrayList otherDelimiters = new ArrayList(); if (row != null) { if (row.contains("\r\n")) { foundTwoCharacterDelimiter = true; if (metadataDelimiter == null || (!metadataDelimiter.equals("\\r\\n") && !metadataDelimiter.equalsIgnoreCase("#x0D#x0A") ) ) { otherDelimiters.add("\\r\\n"); } } if (row.contains("\n")) { if (metadataDelimiter == null || (!metadataDelimiter.equals("\\n") && !metadataDelimiter.equalsIgnoreCase("#x0A") && !foundTwoCharacterDelimiter ) ) { otherDelimiters.add("\\n"); } } if (row.contains("\r")) { if (metadataDelimiter == null || (!metadataDelimiter.equals("\\r") && !metadataDelimiter.equalsIgnoreCase("#x0D") && !foundTwoCharacterDelimiter ) ) { otherDelimiters.add("\\r"); } } } return otherDelimiters; } /* * This method will read a row of data from a vector. It * discards the header lines but it doesn't discard the footer lines. * This method is called by method getRowDataVectorFromStream(). * * @return A string holding one row of data. */ private String readOneRowDataString() { StringBuffer rowBuffer = new StringBuffer(); String rowDataString = null; int singleCharacter; if (dataReader != null) { try { // Read the first character to start things off singleCharacter = dataReader.read(); while (singleCharacter != -1) { // singleCharacter is not the EOF character char aCharacter = (char) singleCharacter; rowBuffer.append(aCharacter); // Check for a line ending character in the row data if (rowBuffer.indexOf(lineEnding) != -1) { // Strip the header lines if (stripHeader && numHeaderLines > 0 && headLineNumberCount < numHeaderLines) { // Reset string buffer (discard the header line) rowBuffer = null; rowBuffer = new StringBuffer(); } else { rowDataString = rowBuffer.toString(); hasRecordDelimiter = true; break; } headLineNumberCount++; } // Read the next character before looping back singleCharacter = dataReader.read(); } } catch (Exception e) { // Couldn't read data from input stream e.printStackTrace(); rowBuffer = new StringBuffer(); } } // If we have data for the row, then return it if (rowBuffer != null && rowBuffer.length() > 0) { rowDataString = rowBuffer.toString(); } return rowDataString; } /* * This method will forward one index for every element, 1 -> 0, 2->1 */ private void reIndexFooterBufferVector() { for (int i=0; i splitDelimitedRowStringIntoVector(String data) throws Exception { Vector rowVector = new Vector(); if (data == null) { return rowVector; } String[] stringArray = null; /* * If there is no quote character, we can split data directly */ if (quoteCharacter == null && literalCharacter == null) { String delimiterRegex = collapseDelimiters ? fieldDelimiter + "+" : fieldDelimiter; stringArray = data.split(delimiterRegex); } /* * Else, we should skip any field delimiters * found between pairs of quote characters. */ else { stringArray = processQuoteCharacterOneRowData(data); } if (stringArray != null) { int columnCounter = stringArray.length; /* * Quality check for too few fields */ String tooFewIdentifier = "tooFewFields"; QualityCheck tooFewTemplate = QualityReport.getQualityCheckTemplate(tooFewIdentifier); QualityCheck tooFewCheck = new QualityCheck(tooFewIdentifier, tooFewTemplate); if (QualityCheck.shouldRunQualityCheck(entity, tooFewCheck)) { boolean foundTooFew = (columnCounter < numCols); if (foundTooFew) { String expected = numCols + " " + fieldWord(numCols); tooFewCheck.setExpected(expected); String found = columnCounter + " " + fieldWord(columnCounter); tooFewCheck.setFound(found); String explanation = "In row " + rowCounter + ", fewer fields were found in the row than were expected: "; tooFewCheck.setFailedStatus(); explanation += ""; tooFewCheck.setExplanation(explanation); tooFewFieldsCounter++; // Limit the number of these checks included in the quality report if (tooFewFieldsCounter <= FIELD_CHECK_MAX) { entity.addQualityCheck(tooFewCheck); } } } /* * Quality check for too many fields */ String tooManyIdentifier = "tooManyFields"; QualityCheck tooManyTemplate = QualityReport.getQualityCheckTemplate(tooManyIdentifier); QualityCheck tooManyCheck = new QualityCheck(tooManyIdentifier, tooManyTemplate); if (QualityCheck.shouldRunQualityCheck(entity, tooManyCheck)) { boolean foundTooMany = (columnCounter > numCols); if (foundTooMany) { String expected = numCols + " " + fieldWord(numCols); tooManyCheck.setExpected(expected); String found = columnCounter + " " + fieldWord(columnCounter); tooManyCheck.setFound(found); String explanation = null; String truncatedData = data.trim(); if (truncatedData.length() > 200) { truncatedData = truncatedData.substring(0, 200) + "... (truncated)"; } explanation = "In row " + rowCounter + ", more fields were found in the row than were expected: "; tooManyCheck.setFailedStatus(); explanation += ""; tooManyCheck.setExplanation(explanation); tooManyFieldsCounter++; // Limit the number of these checks included in the quality report if (tooManyFieldsCounter <= FIELD_CHECK_MAX) { entity.addQualityCheck(tooManyCheck); } } } if (columnCounter > numCols) { throw new DataNotMatchingMetadataException( "Metadata specifies that data has " + numCols + " columns, but the actual data has " + columnCounter + " columns. Please check that the metadata is correct."); } for (int j = 0; j < stringArray.length; j++) { if (stringArray[j] != null) { rowVector.addElement(stringArray[j].trim()); } else { rowVector.addElement(""); } } /* * Pad missing fields with empty strings so that all the records * have the same number of columns. */ int rowVectorSize = rowVector.size(); if (rowVectorSize < numCols) { for (int j = 0; j < (numCols - rowVectorSize); j++) { rowVector.addElement(""); } } } return rowVector; } /* * Returns singular or plural version of the word "field" for use * in Quality Check output. */ private String fieldWord(int numFields) { return ((numFields == 1) ? "field" : "fields"); } /* * In oneRowData, there are quote character in it. Any field delimiter in the * quotes should be skipped. */ private String[] processQuoteCharacterOneRowData(String oneRowData) throws Exception { String[] elements = null; Vector elementsVector = new Vector(); if (oneRowData == null) { return elements; } quoteCharacter = transformQuoteCharacter(quoteCharacter); char quote = '#'; boolean quoted = false; if (quoteCharacter != null) { quoted = true; quote = quoteCharacter.charAt(0); } char literal = '/'; boolean literaled = false; if (literalCharacter != null) { literaled = true; literal = literalCharacter.charAt(0); } if (literaled && literalCharacter.length() !=1) { throw new Exception("Literal Character length should be 1 character in EML"); } char currentChar ='2'; StringBuffer fieldData = new StringBuffer(); int length = oneRowData.length(); int priviousDelimiterIndex = -2; int currentDelimiterIndex = -2; int delimiterLength = fieldDelimiter.length(); boolean startQuote = false; boolean delimiterAtEnd = false; //this string buffer is only for deleting if hit a delimiter StringBuffer delimiterStorage = new StringBuffer(fieldDelimiter.length()); for (int i=0; i= 0) { previousChar = oneRowData.charAt(i-1); if (previousChar == literal) { escapingQuote = true; // delette the literal character if (!includeLiteralCharacter) { //if we don't want literal character in the data, //we should delete literal character. int fieldLength = fieldData.length(); if ((fieldLength -1-1) >=0) { fieldData.deleteCharAt(fieldLength-1-1); } } } } } if (!escapingQuote) { if (!startQuote) { startQuote = true; } else { startQuote = false; } } } //found a delimiter if (delimiterStorage.indexOf(fieldDelimiter) != -1 && !startQuote) { //check if there is literal escape character before the delimiter, //if there is, then we should skip this delimiter int indexOfCharBeforeDelimiter = i - delimiterLength; boolean escapeDelimiter = false; if (literaled && indexOfCharBeforeDelimiter >= 0) { char charBeforeDelimiter = oneRowData.charAt(indexOfCharBeforeDelimiter); ////there is a literal character before delimiter we should skip this demlimiter if (charBeforeDelimiter == literal) { if (!includeLiteralCharacter) { //if we don't want literal character in the data, //we should delete literal character. int fieldLength = fieldData.length(); if ((fieldLength - delimiterLength -1) >=0) { fieldData.deleteCharAt(fieldLength-delimiterLength-1); } } escapeDelimiter = true; continue; } } // check if the delimiter is in the end of the string if (i == (length-1) && !startQuote && !escapeDelimiter) { delimiterAtEnd = true; } ////here we should treat sequential delimiter as single delimiter if (collapseDelimiters) { priviousDelimiterIndex = currentDelimiterIndex; currentDelimiterIndex = i; //there is nothing between two delimiter, should skip it. if ((currentDelimiterIndex - priviousDelimiterIndex) == delimiterLength) { //delete sequnced delimiter fieldData = new StringBuffer(); continue; } } String value =""; int delimiterIndex = fieldData.lastIndexOf(fieldDelimiter); if (delimiterIndex ==0) { //this path means field data on has delimiter, no real data value =""; } else { value = fieldData.substring(0, delimiterIndex); } elementsVector.add(value); //reset string buffer fieldData fieldData = new StringBuffer(); } } // if startQuote is true at the end, which means there is no close quote character in this row, // code should throw an exception if (startQuote) { throw new Exception("There is a un-closed quote in data file"); } // add last field. If this string end of delimiter, we need add a "" // else, we need to add the value in string buffer. String lastFieldValue = null; if (delimiterAtEnd == true) { //this path means field data on has delimiter, no real data lastFieldValue =""; } else { lastFieldValue = fieldData.toString(); } elementsVector.add(lastFieldValue); //transform vector to string array int size = elementsVector.size(); elements = new String[size]; for (int i=0; i 1) && (element.charAt(0) == quote) && (element.charAt(len-1) == quote) ) { String newElement = element.substring(1, len-1); elements[i] = newElement; } } return elements; } /* * This method will delete the most left char in the given buffer, * and append the new char at the end. So the buffer size will * stay the same. */ private static StringBuffer shiftBuffer(StringBuffer buffer, char newChar) { StringBuffer newBuffer = new StringBuffer(); if (buffer == null) { return newBuffer; } int size = buffer.length(); for (int i=0; i0) { newBuffer.append(oldChar); } } newBuffer.append(newChar); return newBuffer; } /* * If quote character is specified by hex number, we should transform it * to a character. If quote string is longer than 1 character, * throw an exception. */ private String transformQuoteCharacter(String quote) throws Exception { String newQuote = quote; if (newQuote == null) { return newQuote; } else if (newQuote.startsWith("#") && newQuote.length() > 1) { String digits = newQuote.substring(1, newQuote.length()); int radix = 10; if (digits.startsWith("x")) { radix = 16; digits = digits.substring(1, digits.length()); } newQuote = transformDigitsToCharString(radix, digits); } else if ((newQuote.startsWith("0x") || newQuote.startsWith("0X")) && newQuote.length() >2) { int radix = 16; String digits = newQuote.substring(2, newQuote.length()); newQuote = transformDigitsToCharString(radix, digits); } if (newQuote.length() > 1) { throw new Exception("Quote string length should be 1 character in EML"); } return newQuote; } /** * Returns the data as an array of vectors. Each vector will have the same * number of elements as there are columns in the data. * * @param stripHeaderLines true if the header lines should not be included * in the returned data, false otherwise */ public Vector[] getTokenizedData(boolean stripHeaderLines) { if(stripHeaderLines) { Vector[] strip = null; if (numRecords > numHeaderLines) { strip = new Vector[numRecords-numHeaderLines]; for(int i=numHeaderLines; i