/**
* @license
* Licensed under the <a href="http://www.opensource.org/licenses/mit-license.php">MIT license</a>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* Author Greg Kindel (twitter @gkindel), 2013
*/
/**
* CSV-JS - A Comma-Separated Values parser for JS
*
* Built to rfc4180 standard, with options for adjusting strictness:
*
* - optional carriage returns for non-microsoft sources
* - automatically type-cast numeric an boolean values
* - relaxed mode which: ignores blank lines, ignores gargabe following quoted tokens, does not enforce a consistent record length
*
* Adopted for UnityBase by pavel.mash
*
* Usage sample:
*
* var csv = require('@unitybase/base').csv;
* // simple
* var rows = csv.parse('one,two,three\nfour,five,six')
* // rows equals [["one","two","three"],["four","five","six"]]
* // or read from file system
* var fs = require('fs'), f = fs.readFileSync('c:/csv.txt');
* var rows = csv.parse(f);
* for( var i =0; i < rows.length; i++){
* console.log(rows[i]);
* }
*
* @module @unitybase/base/csv1
*/
var QUOTE = '"',
CR = '\r',
LF = '\n',
COMMA = ';',
SPACE = ' ',
TAB = '\t'
// implemented as a singleton because JS is single threaded
var CSV = {}
CSV.RELAXED = false
CSV.IGNORE_RECORD_LENGTH = false
CSV.IGNORE_QUOTES = false
CSV.LINE_FEED_OK = true
CSV.CARRIAGE_RETURN_OK = true
CSV.DETECT_TYPES = true
CSV.IGNORE_QUOTE_WHITESPACE = true
CSV.DEBUG = false
CSV.QUOTE = QUOTE
CSV.COMMA = COMMA
CSV.ERROR_EOF = 'UNEXPECTED_END_OF_FILE'
CSV.ERROR_CHAR = 'UNEXPECTED_CHARACTER'
CSV.ERROR_EOL = 'UNEXPECTED_END_OF_RECORD'
CSV.WARN_SPACE = 'UNEXPECTED_WHITESPACE' // not per spec, but helps debugging
// states
var PRE_TOKEN = 0,
MID_TOKEN = 1,
POST_TOKEN = 2,
POST_RECORD = 4
/**
* <a href="http://www.ietf.org/rfc/rfc4180.txt">rfc4180</a> standard csv parse
* with options for strictness and data type conversion
* By default, will automatically type-cast numeric an boolean values.
*
* @method parse
* @param {String} str A CSV string
* @param {String} [comma=";"] column separator
* @return {Array} An array records, each of which is an array of scalar values.
*/
CSV.parse = function (str, comma) {
if (comma) {
CSV.COMMA = comma
}
var result = CSV.result = []
CSV.offset = 0
CSV.str = str
CSV.record_begin()
CSV.debug('parse()', str)
var c
while (1) {
// pull char
c = str[CSV.offset++]
CSV.debug('c', c)
// detect eof
if (c == null) {
if (CSV.escaped)
{ CSV.error(CSV.ERROR_EOF) }
if (CSV.record) {
CSV.token_end()
CSV.record_end()
}
CSV.debug('...bail', c, CSV.state, CSV.record)
CSV.reset()
break
}
if (CSV.record == null) {
// if relaxed mode, ignore blank lines
if (CSV.RELAXED && (c == LF || c == CR && str[CSV.offset + 1] == LF)) {
continue
}
CSV.record_begin()
}
// pre-token: look for start of escape sequence
if (CSV.state == PRE_TOKEN) {
if ((c === SPACE || c === TAB) && CSV.next_nonspace() == CSV.QUOTE) {
if (CSV.RELAXED || CSV.IGNORE_QUOTE_WHITESPACE) {
continue
}
else {
// not technically an error, but ambiguous and hard to debug otherwise
CSV.warn(CSV.WARN_SPACE)
}
}
if (c == CSV.QUOTE && !CSV.IGNORE_QUOTES) {
CSV.debug('...escaped start', c)
CSV.escaped = true
CSV.state = MID_TOKEN
continue
}
CSV.state = MID_TOKEN
}
// mid-token and escaped, look for sequences and end quote
if (CSV.state == MID_TOKEN && CSV.escaped) {
if (c == CSV.QUOTE) {
if (str[CSV.offset] == CSV.QUOTE) {
CSV.debug('...escaped quote', c)
CSV.token += CSV.QUOTE
CSV.offset++
}
else {
CSV.debug('...escaped end', c)
CSV.escaped = false
CSV.token_escaped = true
CSV.state = POST_TOKEN
}
}
else {
CSV.token += c
CSV.debug('...escaped add', c, CSV.token)
}
continue
}
// fall-through: mid-token or post-token, not escaped
if (c == CR) {
if (str[CSV.offset] == LF)
{ CSV.offset++ }
else if (!CSV.CARRIAGE_RETURN_OK)
{ CSV.error(CSV.ERROR_CHAR) }
CSV.token_end()
CSV.record_end()
}
else if (c == LF) {
if (!(CSV.LINE_FEED_OK || CSV.RELAXED))
{ CSV.error(CSV.ERROR_CHAR) }
CSV.token_end()
CSV.record_end()
}
else if (c == CSV.COMMA) {
CSV.token_end()
}
else if (CSV.state == MID_TOKEN) {
CSV.token += c
CSV.debug('...add', c, CSV.token)
}
else if (c === SPACE || c === TAB) {
if (!CSV.IGNORE_QUOTE_WHITESPACE)
{ CSV.error(CSV.WARN_SPACE) }
}
else if (!CSV.RELAXED) {
CSV.error(CSV.ERROR_CHAR)
}
}
return result
}
CSV.reset = function () {
CSV.state = null
CSV.token = null
CSV.escaped = null
CSV.record = null
CSV.offset = null
CSV.result = null
CSV.str = null
}
CSV.next_nonspace = function () {
var i = CSV.offset
var c
while (i < CSV.str.length) {
c = CSV.str[i++]
if (!(c == SPACE || c === TAB)) {
return c
}
}
return null
}
CSV.record_begin = function () {
CSV.escaped = false
CSV.record = []
CSV.token_begin()
CSV.debug('record_begin')
}
CSV.record_end = function () {
CSV.state = POST_RECORD
if (!(CSV.IGNORE_RECORD_LENGTH || CSV.RELAXED)
&& CSV.result.length > 0 && CSV.record.length != CSV.result[0].length) {
CSV.error(CSV.ERROR_EOL)
}
CSV.result.push(CSV.record)
CSV.debug('record end', CSV.record)
CSV.record = null
}
CSV.resolve_type = function (token) {
if (token.match(/^\d+(\.\d+)?$/)) {
token = parseFloat(token)
}
else if (token.match(/^true|false$/i)) {
token = Boolean(token.match(/true/i))
}
else if (token === 'undefined') {
token = undefined
}
else if (token === 'null') {
token = null
}
return token
}
CSV.token_begin = function () {
CSV.state = PRE_TOKEN
// considered using array, but http://www.sitepen.com/blog/2008/05/09/string-performance-an-analysis/
CSV.token = ''
}
CSV.token_end = function () {
if (CSV.DETECT_TYPES && !CSV.token_escaped) {
CSV.token = CSV.resolve_type(CSV.token)
}
CSV.token_escaped = false
CSV.record.push(CSV.token)
CSV.debug('token end', CSV.token)
CSV.token_begin()
}
CSV.debug = function () {
if (CSV.DEBUG)
{ console.log(arguments) }
}
CSV.dump = function (msg) {
return [
msg, 'at char', CSV.offset, ':',
CSV.str.substr(CSV.offset - 50, 50)
.replace(/\r/mg, '\\r')
.replace(/\n/mg, '\\n')
.replace(/\t/mg, '\\t')
].join(' ')
}
CSV.error = function (err) {
var msg = CSV.dump(err)
CSV.reset()
throw msg
}
CSV.warn = function (err) {
var msg = CSV.dump(err)
try {
console.warn(msg)
return
} catch (e) {}
try {
console.log(msg)
} catch (e) {}
}
module.exports = CSV