% parse.pl

% finding statements of the form X=Y
% within a complex string

:- [format].

demo :- tell('parse.out'), ignore(demo1), told.

demo1 :-
	chars(g1,Chars),             % separate file I/O...
	tokens(Tokens,Chars,[]),     % .. from tokenisation
	parse(Statements,Tokens,[]), % .. from parsing
	!,  % <--- block backtracking; otherwise
            % DCGs can go AWOL.
	format('\n--| chars |--\n ~20L\tetc.,\n\n',[Chars]),	
	format('\n--| tokens |--\n ~L',           [Tokens]),
	format('\n--| statements |--\n ~L',   [Statements]).

        % note the (at first strange) convention of DCGs:
        % each rule grows two extra arguments- the input
        % list is the second-to-last arg, the unconsumed
        % remainder is the last, and any other args are
        % the outputs
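        % e.g. (a sketch of the usual translation- the exact
        % expansion varies between Prolog systems) the rule
        %     special(equal) --> "=".
        % compiles to something like
        %     special(equal, [0'=|S], S).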

% file i/o
chars(F,List) :- see(F), chars1(List), !, seen.
chars(_,[])   :- seen. % catch-all clause for when chars1 fails.
                       % never leave files open!

chars1(L) :-
	get0(X), % <-- primitive character reader
	chars2(X,L).

chars2(-1,[]) :- !. % -1 signals end of file
chars2(H, [H|T]) :-
	get0(Next),
	chars2(Next,T).
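
% usage sketch: count_chars/1 is our addition (not used by
% demo/0); note get0/1 is the classic name for what ISO
% Prolog calls get_code/1.
count_chars(File) :-
	chars(File,Cs),
	length(Cs,N),
	write(File-N), nl.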

/* understanding text strings is really three problems:
 1) tokenization   - characters to tokens
 2) parsing        - tokens to structure
 3) interpretation - structure to meaning

Tokenization:

The individual atomic expressions of a language
are (usually) written down as sequences of simpler
characters, so the text ultimately arrives as a
character stream. The task of a tokenizer is, at
least, to segment this character sequence, turning
it into a token stream. The tokens ('token' being
the fancy word here for 'word' or 'atomic
expression') are often also assigned a category.
We will support two categories: $X will denote
special words and X (without a "$") will denote
everything else.
*/
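% e.g. the eleven characters of
%     size="1,2";
% become the eight tokens
%     size $equal $quote 1 $comma 2 $quote $end
% (a sketch of what tokens1//1 below yields- note that "1"
% and "2" arrive as numbers, courtesy of name/2 in blacks).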

% note that we can "eat" more than one character in a leaf dcg
special(digraph) --> "digraph".
special(equal)   --> "=".
special(comma)   --> ",".
special(end)     --> ";".
special(quote)   --> [34]. % 34= "
% curly brackets
special(copen)   --> "{".
special(cclose)  --> "}".
% round brackets
special(ropen)   --> "(".
special(rclose)  --> ")".
% square brackets
special(sopen)   --> "[".
special(sclose)  --> "]".
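
% e.g. (illustrative query) the "digraph" rule eats seven
% character codes in one step:
%     ?- special(W,"digraph G",Rest).
%     W = digraph, Rest = " G".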

/*
Once tokenized, the input stream is smaller and more abstract,
which makes writing and debugging a parser easier.
*/

%-------------------------------------
% parser- assuming file written
parse(S)  --> statements(S).

% zero or more statements,
% and sometimes we collect information
% from them
statements([]) --> [].
statements([One|Rest]) -->
	[graph, $sopen], % statements beginning
	                 % with the keyword "graph"
	                 % are the ones we collect
	assignment(One),
	[$sclose,$end],
	statements(Rest).
statements(Rest) -->     % all other statements
	                 % are dull and we will
	                 % ignore them
	dull,
	[$end],
	statements(Rest).

assignment(X=Y) --> [X,$equal], values(Y).

% values are either quoted lists...
values(Y) --> [$quote], items(Y), [$quote].
% ... or single items
values(Y) --> [Y].

% items are connected to other items via commas
items([One|Rest]) --> [One,$comma],items(Rest).
items([One])      --> [One].

% dull matches any token sequence- use carefully.
dull --> [].
dull --> [_], dull.
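
% a sketch of the grammar at work: demo_parse/1 is our
% addition, and the token list is what the tokenizer below
% produces for the text 'graph [size="1,2"];'.
demo_parse(S) :-
	parse(S,[graph,$sopen,size,$equal,$quote,1,$comma,2,
	         $quote,$sclose,$end],[]),
	!. % expect S = [size=[1,2]]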

%-------------------------------------
% here's a tokenizer for graphviz files.
tokens(T) -->
	header,
	tokens1(T),  % ignore header and footer
	footer.
	
header -->
	whites,      % skip over whitespace
	"digraph",   % read, and ignore, a keyword
	whites,
	"G",
	whites,
	"{".

footer -->
	whites,      
	"}",
	whites.

% program body holds zero or more tokens
tokens1([]) --> [].
tokens1([One|Rest]) --> token(One), tokens1(Rest).

% tokens may have leading white space
token(X) --> whites, token1(X).

% two token types:
% type 1: specials (denoted with a leading "$")
token1($X) --> special(X).

% type 2: everything else
% note the use of clause ordering to control
% the parse- hence the cut in demo1: please, no
% backtracking into a DCG parser from outside.
token1(X) --> blacks(X). % reads all non-white,
                         % non-special characters

% spin down the input string to the first
% non-white-space thing.

% note: this is a good place to add a "comment
% skipper"- see the sketch after newline//0 below.
whites([],[]) :- !.
whites([H|T],Out) :-
	white(H),
	!,
	whites(T,Out).
whites(L,L).

white(H) :- space([H],[]).

space      --> " " | tabb | newline.
tabb       --> [9].
newline    --> [10].
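
% a sketch of such a comment skipper: skip_line/2 is our
% addition and is not wired in- to enable it, add the clause
%     whites([0'/,0'/|T],Out) :- !, skip_line(T,T1), whites(T1,Out).
% just before the final whites(L,L) clause.
skip_line([],[])    :- !.  % end of input ends the comment
skip_line([10|T],T) :- !.  % so does a newline
skip_line([_|T],Out) :- skip_line(T,Out).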

% collect characters up to the first white-space
% or special thing, then pack them into one token.
blacks(X,L0,L) :-
	blacks1(Y,L0,L),
	name(X,Y). %<-- name/2 packs the code list into an
	           %    atom- or a number, if the codes
	           %    read as one

blacks1([],[],[]) :- !.
blacks1([],L,L) :-
	% tricky stuff- if the NEXT thing
	% is a special, stop here and
	% return the list, unchanged
	special(_,L,_),!.
blacks1([H|Blacks],[H|T],Rest) :-
	\+ white(H),
	!,
	blacks1(Blacks,T,Rest).
blacks1([],L,L).
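
% e.g. (illustrative queries):
%     ?- blacks(X,"size=",R).
%     X = size, R = "=".
%     ?- blacks(X,"42;",R).
%     X = 42, R = ";".    % a number, thanks to name/2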

% phew- thank heavens that we don't need to do
% this every time we write a DSL- for languages
% we can define with infix, prefix, and postfix
% operators, Prolog's "read" predicate does all
% this for us.
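
% an end-to-end sketch: demo_string/2 is our addition; it
% assumes the double_quotes flag maps "..." to code lists,
% as the rest of this file already does.
demo_string(Tokens,Statements) :-
	Chars = "digraph G { graph [size=\"1,2\"]; }",
	tokens(Tokens,Chars,[]),
	parse(Statements,Tokens,[]),
	!. % expect Statements = [size=[1,2]]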




