根據協議 http://www.rfc-editor.org/rfc/rfc4180.txt 開發
CSV 語法的規則
file = [header CRLF] record *(CRLF record) [CRLF]
header = name *(COMMA name)
record = field *(COMMA field)
name = field
field = (escaped / non-escaped)
escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
non-escaped = *TEXTDATA
COMMA = %x2C
CR = %x0D ;as per section 6.1 of RFC 2234 [2]
DQUOTE = %x22 ;as per section 6.1 of RFC 2234 [2]
LF = %x0A ;as per section 6.1 of RFC 2234 [2]
CRLF = CR LF ;as per section 6.1 of RFC 2234 [2]
TEXTDATA = %x20-21 / %x23-2B / %x2D-7E
CSV 解析有限狀態機
編碼實現
出於降低代碼圈複雜度的考慮,INIT 狀態與 FIELD_END 狀態合併,兩個狀態唯一的區別是:
當前行數據為空時,INIT 狀態可以在解析過程中識別出來,而 FIELD_END 狀態則無法在解析時識別。
針對該問題,可以在解析前先判斷當前行是否為空,這樣既不影響代碼邏輯,也不影響性能。
引包如下:
#include <fstream> #include <iostream> #include <vector> #include <string>
有限狀態機實現代碼:
typedef enum CSV_RECORD_STATUS { // CSV_RECORD_STATUS_INIT = 1001, // use CSV_RECORD_STATUS_FIELD_END instead CSV_RECORD_STATUS_ESCAPED_PRE = 1002, CSV_RECORD_STATUS_ESCAPED_SUB, CSV_RECORD_STATUS_ESCAPED_FIELD, CSV_RECORD_STATUS_NON_ESCAPED_FIELD, CSV_RECORD_STATUS_FIELD_END, CSV_RECORD_STATUS_ERROR_BUTT }CSV_RECORD_STATUS; int TransferNonEscapedField(char ch) { switch ( ch ) { case DELIMITER: return CSV_RECORD_STATUS_FIELD_END; case QUOTE: return CSV_RECORD_STATUS_ERROR_BUTT; default: return CSV_RECORD_STATUS_NON_ESCAPED_FIELD; } } int TransferEscapedField(char ch) { switch ( ch ) { case QUOTE: return CSV_RECORD_STATUS_ESCAPED_SUB; default: return CSV_RECORD_STATUS_ESCAPED_FIELD; } } int TransferEscapedPre(char ch) { switch ( ch ) { case QUOTE: return CSV_RECORD_STATUS_ESCAPED_SUB; default: return CSV_RECORD_STATUS_ESCAPED_FIELD; } } int TransferEscapedSub(char ch) { switch ( ch ) { case DELIMITER: return CSV_RECORD_STATUS_FIELD_END; case QUOTE: return CSV_RECORD_STATUS_ESCAPED_FIELD; default: return CSV_RECORD_STATUS_ERROR_BUTT; } } int TransferFieldEnd(char ch) { switch ( ch ) { case QUOTE: return CSV_RECORD_STATUS_ESCAPED_PRE; case DELIMITER: return CSV_RECORD_STATUS_FIELD_END; default: return CSV_RECORD_STATUS_NON_ESCAPED_FIELD; } } int RecordStatus(int preStatus, char ch) { switch ( preStatus ) { //case CSV_RECORD_STATUS_INIT: // return TransferInit(ch); // Field case CSV_RECORD_STATUS_NON_ESCAPED_FIELD: return TransferNonEscapedField(ch); case CSV_RECORD_STATUS_ESCAPED_FIELD: return TransferEscapedField(ch); // Quote in escaped filed case CSV_RECORD_STATUS_ESCAPED_PRE: return TransferEscapedPre(ch); case CSV_RECORD_STATUS_ESCAPED_SUB: return TransferEscapedSub(ch); // Field End case CSV_RECORD_STATUS_FIELD_END: return TransferFieldEnd(ch); // Error default: return CSV_RECORD_STATUS_ERROR_BUTT; } }
使用有限狀態機解析 CSV:
const char DELIMITER = ','; // COMMA const char QUOTE = '"'; bool hasError(int status) { return (CSV_RECORD_STATUS_ERROR_BUTT == status); } bool isTextData(int status) { return (CSV_RECORD_STATUS_ESCAPED_FIELD == status) || (CSV_RECORD_STATUS_NON_ESCAPED_FIELD == status); } bool isFieldEnd(int status) { return (CSV_RECORD_STATUS_FIELD_END == status); } void FieldEnd(std::vector<std::string>& record, std::string& field) { //std::cout << field << " || "; record.push_back(field); field.clear(); } void RecordHandler(std::vector<std::string>& record, int sequence) { //if ( 0 == (sequence % 10000) ) { std::cout << std::endl << "Records: " << sequence << std::endl; } record.clear(); } bool LastFieldEnd(int status, std::vector<std::string>& record, std::string& field) { // error check if ( (CSV_RECORD_STATUS_ESCAPED_SUB != status) && (CSV_RECORD_STATUS_NON_ESCAPED_FIELD != status) ) { return false; } FieldEnd(record, field); return true; } int CCsvParser::Parse(std::string filename) { std::ifstream fin; fin.open(filename, std::ios::in); if (!fin.is_open()) { return false; } std::string header; std::getline(fin, header); // TODO: deal with header int recordCount = 0; // count without header std::string lineData; std::vector<std::string> record; std::string field; bool newField = true; while(std::getline(fin, lineData)) { recordCount++; int curStatus = CSV_RECORD_STATUS_FIELD_END; record.clear(); field.clear(); // parser line to record // std::cout << lineData << std::endl; for (size_t i =0; i < lineData.size(); i++) { curStatus = RecordStatus(curStatus, lineData[i]); if (hasError(curStatus)) { break; // next line } else if (isTextData(curStatus)) { field.push_back(lineData[i]); } else if (isFieldEnd(curStatus)) { FieldEnd(record, field); } } // line end, deal with last field LastFieldEnd(curStatus, record, field); RecordHandler(record, recordCount); } fin.close(); return recordCount; }