erlang中的csv解析器

4si2a6ki  于 2022-12-08  发布在  Erlang
关注(0)|答案(8)|浏览(263)

for my application i have to parse CSV file using Erlang.following is the code which will parse CSV using Erlang:-

parse_file(Fn) ->
{ok, Data} = file:read_file(Fn),
parse(binary_to_list(Data)).

parse(Data) -> lists:reverse(parse(Data, [])).

parse([], Acc) -> Acc;
parse(Data, Acc) ->
{Line, Tail} = parse_line(Data),
parse(Tail, [Line|Acc]).

parse_line(Data) ->
{Line, Tail} = parse_line(Data, []),
{lists:reverse(Line), Tail}.

parse_line([13,10|Data], Acc) -> {Acc, Data};
parse_line([10|Data], Acc) -> {Acc, Data};
parse_line([13|Data], Acc) -> {Acc, Data};
parse_line([], Acc) -> {Acc, []};
parse_line([$,,$,|Data], Acc) -> parse_line(Data, [""|Acc]);
parse_line([$,|Data], Acc) -> parse_line(Data, Acc);
parse_line(Data, Acc) ->
{Fld, Tail} = parse_field(Data),
parse_line(Tail, [Fld|Acc]).

parse_field([34|Data]) ->
{Fld, Tail} = parse_fieldq(Data, ""),
{lists:reverse(Fld), Tail};
parse_field(Data) ->
{Fld, Tail} = parse_field(Data, ""),
{lists:reverse(Fld), Tail}.

parse_field([$,|Tail], Acc) -> {Acc, [$,|Tail]};
parse_field([13|Tail], Acc) -> {Acc, [13|Tail]};
parse_field([10|Tail], Acc) -> {Acc, [10|Tail]};
parse_field([], Acc) -> {Acc, []};
parse_field([Ch|Tail], Acc) -> parse_field(Tail, [Ch|Acc]).

parse_fieldq([34,34|Tail], Acc) -> parse_fieldq(Tail, [34|Acc]);
parse_fieldq([34|Tail], Acc) -> {Acc, Tail};
parse_fieldq([Ch|Tail], Acc) -> parse_fieldq(Tail, [Ch|Acc]).

this code works fine but having two issues:- 1-since the code parse using double quote ("") and comma(,) and separate each value..but in following example if First name consist of double quote sting within it then the parser will create one more field.

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------"All Pain Will End."","","itisashwani4u@gmail.com"

result:-
[["contact"],["Ashwani  Garg ------"],["All Pain Will End."],[],["itisashwani4u@gmail.com"]]

expected result:-
[["contact"],["Ashwani  Garg ------All Pain Will End."],[],["itisashwani4u@gmail.com"]]

2-for the following kind of csv its for value,its truncate some value:- First Name,Last Name,Middle Name,Name,Nickname,E-mail Address,Home Street,Home City,Home Postal Code,Home State,Home Country/Region,Home Phone,Home Fax,Mobile Phone,Personal Web Page,Business Street,Business City,Business Postal Code,Business State,Business Country/Region,Business Web Page,Business Phone,Business Fax,Pager,Company,Job Title,Department,Office Location,Notes

Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
    result:-
    [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
    expected result:-
   [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

Please help me ...for refernce please use the following link:- http://ppolv.wordpress.com/2008/02/25/parsing-csv-in-erlang/

czq61nw1

czq61nw11#

parse(File) ->
  {ok, F} = file:open(File, [read, raw]),
  parse(F, file:read_line(F), []).

parse(F, eof, Done) ->
  file:close(F),
  lists:reverse(Done);    

parse(F, Line, Done) ->
  parse(F, file:read_line(F), [parse_line(Line)|Done]).


parse_line(Line) -> parse_line(Line, []).

parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).

parse_field("\"" ++ Line, Fields) -> parse_field_q(Line, [], Fields);
parse_field(Line, Fields) -> parse_field(Line, [], Fields).

parse_field("," ++ _ = Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf)|Fields]);
parse_field([C|Line], Buf, Fields) -> parse_field(Line, [C|Buf], Fields);
parse_field([], Buf, Fields) -> parse_line([], [lists:reverse(Buf)|Fields]).

parse_field_q(Line, Fields) -> parse_field_q(Line, [], Fields).
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$"|Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [lists:reverse(Buf)|Fields]);
parse_field_q([C|Line], Buf, Fields) -> parse_field_q(Line, [C|Buf], Fields).

without file:read_line :

parse_file(File) ->
  {ok, Data} = file:read_file(File),
  parse(binary_to_list(Data), []).

parse([], Done) ->
  lists:reverse(Done);

parse(Data, Done) ->
  {Line, Rest} = case re:split(Data, "\r|\n|\r\n", [{return, list}, {parts, 2}]) of
                   [L,R] -> {L,R};
                   [L]   -> {L,[]}
                 end,
  parse(Rest, [parse_line(Line)|Done]).
ilmyapht

ilmyapht2#

附带问题:
您是如何创建CSV输入的?它看起来不是有效的CSV(虽然CSV没有特别严格的规范)。
通常,要在CSV字段中使用双引号,需要将它们转义为一对双引号,因此您的示例如下:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------""All Pain Will End.""","","itisashwani4u@gmail.com"

这将很好地导入到Open Office电子表格中,而您的原始示例却不能。

slwdgvem

slwdgvem3#

I came across your implementation the other day and started playing around with it.
I made you a parser as well.

-module(csv_parser).

-export([parse_file/1]).

parse_file(File) ->
  {ok, Data} = file:read_file(File),
  parse(Data).

parse(Data) ->
    Lines = re:split(Data, "\r|\n|\r\n", [] ), 
    [ [begin
           case  re:split(Token, "\"", [] ) of 
               [_,T,_] -> T;
               [T] -> T; % if token is not surrounded by ""
               [] -> <<"">>
           end
       end || Token <- re:split(Line, ",", [] ) ] || Line <- Lines, Line =/= <<"">>].

I even wrote a small blogpost on this csv parser

44u64gxh

44u64gxh4#

Trapexit中也讨论了从文件中阅读行。根据您的需要进行修改应该是很简单的:
http://www.trapexit.org/Reading_Lines_from_a_File

qjp7pelc

qjp7pelc5#

我的实现:

-module(csv).

-export([
    parse/1
]).

parse(File) ->
    try
        {ok, Bin} = file:read_file(File),
        {ok, parse(binary_to_list(Bin), [], [], [])}
    catch
        Class:Error ->
            {Class, Error}
    end.

parse([], _FBuff, _RBuff, Result) ->
    lists:reverse(Result);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse(Rest1, [], [F | RBuff], Result);
parse([$,, $\s| Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);    
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], _FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse([$\n | Rest], _FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

parse_q([$", $, | Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).

但是,这种解决方案不能处理嵌套引号...
例如:
1,“您好,“World”",“她说:“这是“解决方案”,不是吗?"",2000\r\n

ijnw1ujt

ijnw1ujt6#

另一个可能的解决方案。可以很容易地改为懒惰求值,这样就不需要一次阅读整个文件。

parse(Data) -> parse(Data, [], [], []).

parse([$\r|Data], Field, Fields, Lines) -> parse_r(Data, Field, Fields, Lines);
parse([$\n|Data], Field, Fields, Lines) -> parse(Data, [], [], [[Field|Fields]|Lines]);
parse([$,|Data], Field, Fields, Lines)  -> parse(Data, [], [Field|Fields], Lines);
parse([$"|Data], [], Fields, Lines)     -> parse_q(Data, [], Fields, Lines);
parse([C|Data], Field, Fields, Lines)   -> parse(Data, [C|Field], Fields, Lines);
parse([], Field, Fields, Lines)         -> 
  lists:reverse(
      [lists:reverse(
          [lists:reverse(F) || F <- L]
        ) || L <- [[Field|Fields]|Lines]]
    ).

parse_r([$\n|_] = Data, Field, Fields, Lines) -> parse(Data, Field, Fields, Lines).

parse_q([$"|Data], Field, Fields, Lines) -> parse_qq(Data, Field, Fields, Lines);
parse_q([C|Data], Field, Fields, Lines)  -> parse_q(Data, [C|Field], Fields, Lines).

parse_qq([$"|Data], Field, Fields, Lines) -> parse_q(Data, [$"|Field], Fields, Lines);
parse_qq([C|_] = Data, Field, Fields, Lines)  
  when C == $,; C == $\r; C == $\n        -> parse(Data, Field, Fields, Lines);
parse_qq([], Field, Fields, Lines)        -> parse([], Field, Fields, Lines).
2uluyalo

2uluyalo7#

我在zed的答案中添加了一些增强功能。

-module (helper_csv_parser).
-compile(export_all).

% Taken from http://stackoverflow.com/questions/1532081/csv-parser-in-erlang, modified to fix errors.
parse(File) ->
    {ok, F} = file:open(File, [read, {encoding, utf8}]),
    {ok, L} = file:read_line(F),
    parse(F, string:strip(L, right, $\n), [], 1).

parse(F, eof, Done, _) ->
    file:close(F),
    lists:reverse(Done);

parse(F, Line, Done, Ctr) ->
    Res = file:read_line(F),

    case Res of
        {error,collect_line} -> throw({error, "Might be unicode at line " ++ helper:i2s(Ctr)});
        {ok, L} -> parse(F, string:strip(L, right, $\n),[parse_line(Line)|Done], Ctr+1);
        eof -> parse(F,eof,[parse_line(Line)|Done], Ctr+1)
    end.

parse_line("," ++ Line) -> parse_line(Line, [[]]);
parse_line(Line) -> parse_line(Line, []).

parse_line([], Fields) -> lists:reverse(Fields);
parse_line("," ++ Line, Fields) -> parse_field(Line, Fields);
parse_line(Line, Fields) -> parse_field(Line, Fields).

parse_field("\"" ++ Line, Fields) -> parse_field_q(Line, [], Fields);
parse_field(Line, Fields) -> parse_field(Line, [], Fields).

parse_field("," ++ _ = Line, Buf, Fields) -> parse_line(Line, [string:strip(lists:reverse(Buf))|Fields]);
parse_field([C|Line], Buf, Fields) -> parse_field(Line, [C|Buf], Fields);
parse_field([], Buf, Fields) -> parse_line([], [lists:reverse(Buf)|Fields]).

parse_field_q(Line, Fields) -> parse_field_q(Line, [], Fields).
parse_field_q("\"\"" ++ Line, Buf, Fields) -> parse_field_q(Line, [$"|Buf], Fields);
parse_field_q("\"" ++ Line, Buf, Fields) -> parse_line(Line, [string:strip(lists:reverse(Buf))|Fields]);
parse_field_q([C|Line], Buf, Fields) -> parse_field_q(Line, [C|Buf], Fields).
ycggw6v2

ycggw6v28#

Wisher的答案还不错,只是它丢失了每一行csv的最后一个元素。这里有一个修复程序。尽管它仍然不能处理嵌入的引号。

-module(csv).

-export([read/1]).

read(File) ->
    try
        {ok, Bin} = file:read_file(File),
        {ok, parse(binary_to_list(Bin), [], [], [])}
    catch
        Class:Error ->
            {Class, Error}
    end.

parse([], _FBuff, _RBuff, Result) ->
    lists:reverse(Result);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse(Rest1, [], [F | RBuff], Result);
parse([$,, $\s| Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse([lists:reverse(FBuff) | RBuff]) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse([lists:reverse(FBuff) | RBuff]) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

parse_q([$", $, | Rest], Result) ->
    {lists:reverse(Result), Rest};
parse_q([A | Rest], Result) ->
    parse_q(Rest, [A | Result]).

相关问题