I am learning Erlang, and as an exercise I tried to implement the CYK algorithm.
Main code (cyk.erl):
%%% An attempt at a CYK parser in Erlang
-module(cyk).

-export([
    init_storage/0,
    import_grammar_file/1,
    add_grammar_rule/1,
    analyze/1,
    test_analyze/0
]).

%% Initialize the ets storage for the grammar
init_storage() ->
    ets:new(?MODULE, [bag, named_table]).
%%------------------------------------------
%%
%% Grammar
%%
%%------------------------------------------

%% Import a grammar file
import_grammar_file(File) ->
    {ok, Device} = file:open(File, [read]),
    import_file_rules(Device).

%% Import all the rules in the file
import_file_rules(Device) ->
    case io:get_line(Device, "") of
        eof ->
            io:format("Grammar file imported~n"),
            file:close(Device);
        Line ->
            add_grammar_rule(Line),
            import_file_rules(Device)
    end.

%% Add a grammar rule
add_grammar_rule(Rule) ->
    case re:run(Rule, "^([^\s]+)\s?->\s?([^\n]+)$", [{capture, all_but_first, binary}]) of
        {match, [A, B]} ->
            ets:insert(?MODULE, {A, B}),
            io:format("parsing ~p -> ~p~n", [A, B]);
        nomatch ->
            io:format("cannot parse ~p~n", [Rule])
    end.
%%------------------------------------------
%%
%% Main logic
%%
%%------------------------------------------

%% Analyze a sentence
analyze(Sentence) ->
    io:format("analysing: ~p~n", [Sentence]),
    WordList = re:split(Sentence, " "),
    io:format("wordlist: ~p~n", [WordList]),
    Representation = lists:map(fun(Word) -> associate(Word) end, WordList),
    io:format("representation: ~p~n", [Representation]),
    Result = process([Representation]),
    io:format("result: ~p~n", [Result]).

%% Associate sentence words with grammar terms
associate(Word) ->
    case ets:match(?MODULE, {'$1', Word}) of
        [H|T] -> lists:flatten([H|T]);
        [] -> []
    end.

%% Process the sentence representation
process(Representation) ->
    Limit = length(lists:last(Representation)),
    process(Representation, Limit).

process(Representation, Limit) when Limit > 1 ->
    NextStep = process(Representation, 1, Limit - 1, []),
    process([NextStep|Representation], Limit - 1);
process(Representation, _Limit) ->
    Representation.

process(Representation, Index, Limit, Acc) when Index =< Limit ->
    Subtree = extract_subtree(lists:reverse(Representation), Index),
    Result = process_subtree(Subtree),
    process(Representation, Index + 1, Limit, [Result|Acc]);
process(_Representation, _Index, _Limit, Acc) ->
    lists:reverse(Acc).
%%------------------------------------------
%%
%% Subtree
%%
%%------------------------------------------

process_subtree(Subtree) ->
    process_subtree(Subtree, Subtree, [], 1).

process_subtree([], _Subtree, Acc, _Index) ->
    Acc;
process_subtree([H|T], Subtree, Acc, Index) ->
    A = lists:nth(1, H),
    Bind = length(Subtree) - Index + 1,
    B = lists:last(lists:nth(Bind, Subtree)),
    % generating the possibilities of grammar
    Pos = [list_to_binary(binary:bin_to_list(X) ++ " " ++ binary:bin_to_list(Y)) || X <- A, Y <- B],
    % looking up in the grammar
    Result = lists:flatten([ets:match(?MODULE, {'$1', X}) || X <- Pos]),
    process_subtree(T, Subtree, Acc ++ Result, Index + 1).

%% Extract a subtree from the representation
extract_subtree(Representation, Position) ->
    Size = length(Representation) + 1,
    extract_subtree(Representation, Size, Position, []).

extract_subtree([], _Size, _Position, Acc) ->
    lists:reverse(Acc);
extract_subtree([H|T], Size, Position, Acc) ->
    Segment = lists:sublist(H, Position, Size),
    extract_subtree(T, Size - 1, Position, [Segment|Acc]).
%%------------------------------------------
%%
%% Test
%% using the same example as
%% http://en.wikipedia.org/wiki/CYK_algorithm
%%
%%------------------------------------------

test_analyze() ->
    init_storage(),
    import_grammar_file("grammar.txt"),
    analyze("she eats a fish with a fork").
The grammar file (grammar.txt):
S -> NP VP
VP -> VP PP
VP -> V NP
VP -> eats
PP -> P NP
NP -> Det N
NP -> she
V -> eats
P -> with
N -> fish
N -> fork
Det -> a
The code can be tested from the Erlang shell:
> c(cyk).
> cyk:test_analyze().
parsing <<"S">> -> <<"NP VP">>
parsing <<"VP">> -> <<"VP PP">>
parsing <<"VP">> -> <<"V NP">>
parsing <<"VP">> -> <<"eats">>
parsing <<"PP">> -> <<"P NP">>
parsing <<"NP">> -> <<"Det N">>
parsing <<"NP">> -> <<"she">>
parsing <<"V">> -> <<"eats">>
parsing <<"P">> -> <<"with">>
parsing <<"N">> -> <<"fish">>
parsing <<"N">> -> <<"fork">>
parsing <<"Det">> -> <<"a">>
Grammar file imported
analysing: "she eats a fish with a fork"
wordlist: [<<"she">>,<<"eats">>,<<"a">>,<<"fish">>,<<"with">>,<<"a">>,
<<"fork">>]
representation: [[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]
result: [[[<<"S">>]],
[[],[<<"VP">>]],
[[],[],[]],
[[<<"S">>],[],[],[]],
[[],[<<"VP">>],[],[],[<<"PP">>]],
[[<<"S">>],[],[<<"NP">>],[],[],[<<"NP">>]],
[[<<"NP">>],
[<<"VP">>,<<"V">>],
[<<"Det">>],
[<<"N">>],
[<<"P">>],
[<<"Det">>],
[<<"N">>]]]
The code seems to work fine for this example, but I was looking for ways to improve it (make it more Erlang-ish) and especially to distribute the processing across multiple processes/nodes.
I guess all the process_subtree executions for each step could run concurrently, but I can't really figure out how.
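Roughly what I had in mind is the untested sketch below, where parallel_step/2 and collect/2 are names I just made up and which would replace the NextStep = process(Representation, 1, Limit-1, []) call in process/2. I'm not sure it is the right approach, and the spawned workers still read the shared named ets table, so it only helps on a single node:

%% Untested sketch: run each process_subtree/1 of one step in its own
%% process and collect the results back in index order.
parallel_step(Representation, Limit) ->
    Parent = self(),
    Reversed = lists:reverse(Representation),
    _Pids = [spawn(fun() ->
                       Subtree = extract_subtree(Reversed, Index),
                       Parent ! {Index, process_subtree(Subtree)}
                   end) || Index <- lists:seq(1, Limit)],
    collect(Limit, []).

%% Wait for one message per spawned worker, then rebuild the row in order.
collect(0, Acc) ->
    [Result || {_Index, Result} <- lists:keysort(1, Acc)];
collect(N, Acc) ->
    receive
        {Index, Result} -> collect(N - 1, [{Index, Result} | Acc])
    end.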
Any suggestions will be greatly appreciated!
1 Answer
I have written this solution, which uses concurrent execution.
Compared to Eric's solution, some changes are there for the multi-process usage, others because I think they are more efficient (I reversed the keys and values in the rules ets table and chose a set), some because I find them cleaner (I close the grammar file in the same function that opens it), and some because I am more familiar with those modules (string:tokens...).
I replaced the useless spawns with faster recursive calls, and I synchronize the processes with messages, which removes the need for a wait function.
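For illustration only, here is a minimal sketch of what the reversed, set-based rule table could look like; the table name rules and the helper names are just for the example, not necessarily the exact code I used. Since several rules can share a right-hand side (both VP and V produce eats), the value has to be a list of left-hand sides:

%% Sketch (assumed names): the right-hand side is the key,
%% the list of producing left-hand sides is the value.
init_rules() ->
    ets:new(rules, [set, named_table]).

add_rule(Lhs, Rhs) ->
    case ets:lookup(rules, Rhs) of
        [{Rhs, Lhss}] -> ets:insert(rules, {Rhs, [Lhs | Lhss]});
        []            -> ets:insert(rules, {Rhs, [Lhs]})
    end.

%% The CYK lookup becomes a direct key access instead of an ets:match/2 scan.
producers(Rhs) ->
    case ets:lookup(rules, Rhs) of
        [{Rhs, Lhss}] -> Lhss;
        []            -> []
    end.

With this layout, producers(<<"NP VP">>) returns [<<"S">>] directly.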
I got the idea for this implementation while watching a JavaScript animation of the CYK algorithm, which unfortunately is no longer available.
@Eric, you can open the analyze ets table with observer to see all the steps of the analysis, which is why I did not delete it.