modules/csv1.js

/**
 * @license
 * Licensed under the <a href="http://www.opensource.org/licenses/mit-license.php">MIT license</a>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * Author Greg Kindel (twitter @gkindel), 2013
 */
 /**
 * CSV-JS - A Comma-Separated Values parser for JS
 *
 * Built to rfc4180 standard, with options for adjusting strictness:
 *
 *    - optional carriage returns for non-microsoft sources
 *    - automatically type-cast numeric and boolean values
 *    - relaxed mode which: ignores blank lines, ignores garbage following quoted tokens, does not enforce a consistent record length
 *
 * Adapted for UnityBase by pavel.mash
 *
 * Usage sample:
 *
 *       var CSV = require('csv1');
 *       // simple
 *       var rows = CSV.parse('one,two,three\nfour,five,six')
 *       // rows equals [["one","two","three"],["four","five","six"]]
 *       // or read from file system
 *       var fs = require('fs'), f = fs.readFileSync('c:/csv.txt');
 *       var rows = CSV.parse(f);
 *       for( var i =0; i < rows.length; i++){
 *          console.log(rows[i]);
 *       }
 *
 * @module csv1
 */

// Single-character lexical constants used by the parser.
var QUOTE = "\"";
var CR = "\r";
var LF = "\n";
var COMMA = ";"; // default column separator (a semicolon, despite the name)
var SPACE = " ";
var TAB = "\t";


// implemented as a singleton because JS is single threaded
var CSV = {
    // --- behaviour switches ---
    // RELAXED: skip blank lines, tolerate characters after a closing quote,
    // and do not enforce a consistent record length.
    RELAXED: false,
    // Do not require every record to have as many fields as the first one.
    IGNORE_RECORD_LENGTH: false,
    // Treat the quote character as ordinary data instead of an escape marker.
    IGNORE_QUOTES: false,
    // Accept a bare LF as a record terminator.
    LINE_FEED_OK: true,
    // Accept a bare CR (not followed by LF) as a record terminator.
    CARRIAGE_RETURN_OK: true,
    // Run unquoted tokens through CSV.resolve_type.
    DETECT_TYPES: true,
    // Silently accept spaces/tabs around a quoted token.
    IGNORE_QUOTE_WHITESPACE: true,
    // Send parser trace output to console.log.
    DEBUG: false,

    // --- configurable characters ---
    QUOTE: QUOTE,
    COMMA: COMMA,

    // --- message identifiers passed to CSV.error / CSV.warn ---
    ERROR_EOF: "UNEXPECTED_END_OF_FILE",
    ERROR_CHAR: "UNEXPECTED_CHARACTER",
    ERROR_EOL: "UNEXPECTED_END_OF_RECORD",
    WARN_SPACE: "UNEXPECTED_WHITESPACE" // not per spec, but helps debugging
};

// Tokenizer states stored in CSV.state while a parse is running.
var PRE_TOKEN = 0;   // set by CSV.token_begin: nothing of the next field read yet
var MID_TOKEN = 1;   // inside a field (quoted or not)
var POST_TOKEN = 2;  // closing quote seen, waiting for separator or line end
var POST_RECORD = 4; // set by CSV.record_end once a record has been closed
/**
 * <a href="http://www.ietf.org/rfc/rfc4180.txt">rfc4180</a> standard csv parse
 * with options for strictness and data type conversion.
 * By default, will automatically type-cast numeric and boolean values.
 *
 * All working state is kept on the CSV singleton, so calls must not be nested.
 *
 * @method parse
 * @param {String} str A CSV string
 * @param {String} [comma=CSV.COMMA] column separator used for this call only
 *        (CSV.COMMA is ";" unless it has been reconfigured)
 * @return {Array} An array of records, each of which is an array of scalar values.
 * @throws {String} the message built by CSV.error when the input is malformed
 */
CSV.parse = function (str, comma) {
    // Resolve the separator for this call only. Writing it into CSV.COMMA would
    // silently change the default separator of every later parse() call.
    var separator = comma || CSV.COMMA;
    var result = CSV.result = [];
    CSV.offset = 0;
    CSV.str = str;
    // A previous parse that aborted right after a closing quote can leave this
    // flag set, which would suppress type detection for the first token.
    CSV.token_escaped = false;
    CSV.record_begin();

    CSV.debug("parse()", str);

    var c;
    while (true) {
        // pull char; from here on CSV.offset points at the character AFTER c
        c = str[CSV.offset++];
        CSV.debug("c", c);

        // detect eof
        if (c == null) {
            if (CSV.escaped) {
                CSV.error(CSV.ERROR_EOF);
            }

            // flush the record in progress; there is none when the input
            // ended with a line break
            if (CSV.record) {
                CSV.token_end();
                CSV.record_end();
            }

            CSV.debug("...bail", c, CSV.state, CSV.record);
            CSV.reset();
            break;
        }

        if (CSV.record == null) {
            // if relaxed mode, ignore blank lines.
            // str[CSV.offset] is the character following c, so a CR is skipped
            // here only when it starts a CRLF pair; the LF is skipped next turn.
            if (CSV.RELAXED && (c === LF || (c === CR && str[CSV.offset] === LF))) {
                continue;
            }
            CSV.record_begin();
        }

        // pre-token: look for start of escape sequence
        if (CSV.state === PRE_TOKEN) {

            if ((c === SPACE || c === TAB) && CSV.next_nonspace() === CSV.QUOTE) {
                if (CSV.RELAXED || CSV.IGNORE_QUOTE_WHITESPACE) {
                    // drop whitespace that precedes an opening quote
                    continue;
                }
                else {
                    // not technically an error, but ambiguous and hard to debug otherwise
                    CSV.warn(CSV.WARN_SPACE);
                }
            }

            if (c === CSV.QUOTE && !CSV.IGNORE_QUOTES) {
                CSV.debug("...escaped start", c);
                CSV.escaped = true;
                CSV.state = MID_TOKEN;
                continue;
            }
            CSV.state = MID_TOKEN;
        }

        // mid-token and escaped, look for sequences and end quote
        if (CSV.state === MID_TOKEN && CSV.escaped) {
            if (c === CSV.QUOTE) {
                if (str[CSV.offset] === CSV.QUOTE) {
                    // doubled quote inside a quoted field is a literal quote
                    CSV.debug("...escaped quote", c);
                    CSV.token += CSV.QUOTE;
                    CSV.offset++;
                }
                else {
                    CSV.debug("...escaped end", c);
                    CSV.escaped = false;
                    CSV.token_escaped = true;
                    CSV.state = POST_TOKEN;
                }
            }
            else {
                CSV.token += c;
                CSV.debug("...escaped add", c, CSV.token);
            }
            continue;
        }

        // fall-through: mid-token or post-token, not escaped
        if (c === CR) {
            if (str[CSV.offset] === LF) {
                // consume the LF of a CRLF pair
                CSV.offset++;
            }
            else if (!CSV.CARRIAGE_RETURN_OK) {
                CSV.error(CSV.ERROR_CHAR);
            }
            CSV.token_end();
            CSV.record_end();
        }
        else if (c === LF) {
            if (!(CSV.LINE_FEED_OK || CSV.RELAXED)) {
                CSV.error(CSV.ERROR_CHAR);
            }
            CSV.token_end();
            CSV.record_end();
        }
        else if (c === separator) {
            CSV.token_end();
        }
        else if (CSV.state === MID_TOKEN) {
            CSV.token += c;
            CSV.debug("...add", c, CSV.token);
        }
        else if (c === SPACE || c === TAB) {
            // whitespace between a closing quote and the next separator
            if (!CSV.IGNORE_QUOTE_WHITESPACE) {
                CSV.error(CSV.WARN_SPACE);
            }
        }
        else if (!CSV.RELAXED) {
            // any other character after a closing quote
            CSV.error(CSV.ERROR_CHAR);
        }
    }
    return result;
};

/**
 * Clears every piece of per-parse working state held on the singleton.
 * Called by CSV.parse when it finishes and by CSV.error before it throws.
 */
CSV.reset = function () {
    CSV.state = null;
    CSV.token = null;
    CSV.escaped = null;
    // Set when a closing quote is read and normally cleared by CSV.token_end.
    // An error raised in between (e.g. an unexpected character right after a
    // quoted token) would otherwise leave it set for the next parse.
    CSV.token_escaped = false;
    CSV.record = null;
    CSV.offset = null;
    CSV.result = null;
    CSV.str = null;
};

/**
 * Looks ahead from the current read position (CSV.offset) in CSV.str for the
 * first character that is neither a space nor a tab. Nothing is consumed.
 *
 * @return {String|null} that character, or null when only spaces/tabs remain
 */
CSV.next_nonspace = function () {
    for (var pos = CSV.offset; pos < CSV.str.length; pos++) {
        var ch = CSV.str[pos];
        if (ch != SPACE && ch !== TAB) {
            return ch;
        }
    }
    return null;
};

/**
 * Opens a new, empty record: starts with a fresh CSV.record array, leaves
 * quoted mode, and prepares the first token via CSV.token_begin.
 */
CSV.record_begin = function () {
    CSV.record = [];
    CSV.escaped = false;
    CSV.token_begin();
    CSV.debug("record_begin");
};

/**
 * Closes the current record and appends it to CSV.result.
 *
 * Unless CSV.IGNORE_RECORD_LENGTH or CSV.RELAXED is set, a record whose field
 * count differs from the first record's is reported through CSV.error with
 * CSV.ERROR_EOL before anything is appended.
 */
CSV.record_end = function () {
    CSV.state = POST_RECORD;

    var lengthEnforced = !CSV.IGNORE_RECORD_LENGTH && !CSV.RELAXED;
    if (lengthEnforced && CSV.result.length > 0) {
        // the first record defines the expected number of fields
        if (CSV.record.length != CSV.result[0].length) {
            CSV.error(CSV.ERROR_EOL);
        }
    }

    CSV.result.push(CSV.record);
    CSV.debug("record end", CSV.record);
    // no record in progress until the next CSV.record_begin
    CSV.record = null;
};

/**
 * Converts the text of an unquoted token to a typed scalar:
 *
 *    - unsigned integers/decimals ("12", "3.5") become numbers
 *    - "true" / "false" (any letter case) become booleans
 *    - the exact words "undefined" and "null" become those values
 *
 * Anything else is returned unchanged.
 *
 * @param {String} token raw token text
 * @return {*} the converted value, or the original string
 */
CSV.resolve_type = function (token) {
    if (/^\d+(\.\d+)?$/.test(token)) {
        return parseFloat(token);
    }
    // The alternation has to be grouped: /^true|false$/ means "starts with
    // true" OR "ends with false", which turned "truely" into true and
    // "not false" into false.
    if (/^(true|false)$/i.test(token)) {
        return /^true$/i.test(token);
    }
    if (token === "undefined") {
        return undefined;
    }
    if (token === "null") {
        return null;
    }
    return token;
};

/**
 * Prepares for reading the next field: empties the token accumulator and
 * moves the state machine to PRE_TOKEN.
 */
CSV.token_begin = function () {
    // considered using array, but http://www.sitepen.com/blog/2008/05/09/string-performance-an-analysis/
    CSV.token = "";
    CSV.state = PRE_TOKEN;
};

/**
 * Finishes the current token and appends it to the record in progress.
 *
 * When CSV.DETECT_TYPES is on and the token was not quoted, its text is first
 * converted by CSV.resolve_type. The quoted marker is cleared afterwards and
 * the next token is started.
 */
CSV.token_end = function () {
    var value = CSV.token;
    if (CSV.DETECT_TYPES && !CSV.token_escaped) {
        // quoted tokens always stay strings
        value = CSV.resolve_type(value);
    }
    CSV.token = value;
    CSV.token_escaped = false;
    CSV.record.push(value);
    CSV.debug("token end", value);
    CSV.token_begin();
};

/**
 * Trace helper: when CSV.DEBUG is on, passes the received arguments object to
 * console.log; otherwise does nothing.
 */
CSV.debug = function () {
    if (!CSV.DEBUG) {
        return;
    }
    console.log(arguments);
};

/**
 * Builds a one-line diagnostic: the message, the current read position, and
 * up to 50 characters of input that precede that position, with CR, LF and
 * TAB shown as \r, \n and \t.
 *
 * @param {String} msg message identifier, e.g. CSV.ERROR_CHAR
 * @return {String} "<msg> at char <offset> : <context>"
 */
CSV.dump = function (msg) {
    var end = CSV.offset;
    // Clamp the window start at 0: a negative start passed to substr() counts
    // from the END of the string, so errors within the first 50 characters
    // used to show text from the tail of the input.
    var start = Math.max(0, end - 50);
    return [
        msg, "at char", CSV.offset, ":",
        CSV.str.substring(start, end)
            .replace(/\r/g, "\\r")
            .replace(/\n/g, "\\n")
            .replace(/\t/g, "\\t")
    ].join(" ");
};

/**
 * Aborts the current parse: formats the message with CSV.dump, clears the
 * parser state with CSV.reset, and throws the formatted message.
 * Note that the thrown value is a plain string, not an Error object.
 *
 * @param {String} err message identifier, e.g. CSV.ERROR_EOF
 */
CSV.error = function (err) {
    // format first: CSV.dump reads CSV.str and CSV.offset, which CSV.reset clears
    var report = CSV.dump(err);
    CSV.reset();
    throw report;
};

/**
 * Reports a non-fatal problem. The message is formatted with CSV.dump and
 * written with console.warn; if that call throws, console.log is tried
 * instead. Failures of both are deliberately ignored so that a warning can
 * never abort a parse.
 *
 * @param {String} err message identifier, e.g. CSV.WARN_SPACE
 */
CSV.warn = function (err) {
    var report = CSV.dump(err);
    try {
        console.warn(report);
    } catch (warnFailure) {
        try {
            console.log(report);
        } catch (logFailure) {
            // no usable console: drop the warning
        }
    }
};

module.exports = CSV;