#! /usr/bin/env python
# 
#  The Termprocessor Kimwitu++
#
#  Copyright (C) 1998-2001 Humboldt-University of Berlin, Institute of Informatics
#  All rights reserved.
#  
# Kimwitu++ is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Kimwitu++ is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Kimwitu++; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#  
"""
This program parses the verbose bison output in order to extract the states
and their corresponding rules. For example, the snippet
state 258

    parameter_dcls  ->  '(' param_dcl_list . ')'   (rule 168)
    param_dcl_list  ->  param_dcl_list . ',' param_dcl   (rule 171)
is translated to a C array that has, at index 258, the string
"parameter_dcls  ->  '(' param_dcl_list . ')'\n"
"param_dcl_list  ->  param_dcl_list . ',' param_dcl\n"
This array, in turn, can be used for error reporting in the generated
bison parser.

With recent bison versions, a more compact representation can be
found. Using the arrays yyrhs and yyr1, the complete text of a rule
can be reproduced. The only missing information is the association
between (state, (rule number, position of the dot)). If invoked with
--compact, this script generates two arrays

int error_rules[][2], error_state[];

error_state gives, for a state, the index into error_rules, which,
starting at that index and terminated by {0,0} provides all rule
numbers and dot positions.

This program is invoked with two file names: the verbose bison output
(which must end in .output) and the C file to generate, e.g.
    [--compact] foo.output foo-output.c

The following assumptions are made about the input:
- the individual state listings start at column 0 with "state"
- the rules inside a state have a "->" and end with (rule number)
"""

import sys,string,os,re

def usage():
    """Print the command-line synopsis to stdout and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    # sys.stdout.write rather than the print statement: the output is the
    # same, but the file then parses under both Python 2 and Python 3.
    sys.stdout.write("Usage:%s [--compact] Y.OUTPUT ERRMSG.C\n" % sys.argv[0])
    sys.exit(1)


def _dot_position(rhs_tokens):
    """Return the number of right-hand-side symbols that precede the dot.

    rhs_tokens is the whitespace-split right-hand side of a rule.  If the
    rule contains no "." token, the total number of symbols is returned.
    """
    count = 0
    i = 0
    while i < len(rhs_tokens):
        token = rhs_tokens[i]
        if token == ".":
            break
        if token == "'":
            # The character literal ' ' was cut into two lone quote tokens
            # by split(); the pair is one grammar symbol, so skip its
            # second half.  Every such literal is handled, not just the
            # first one in the rule.
            assert rhs_tokens[i + 1:i + 2] == ["'"]
            i += 1
        count += 1
        i += 1
    return count


def _parse_states(lines):
    """Extract the rules of every state from verbose bison output.

    lines is an iterable of text lines.  State listings are expected to
    start at column 0 with "state <number>"; rule lines contain "->" and
    end with "(rule <number>)".

    Returns a pair (states, compact_states):
      states          maps state number -> list of rule texts, with double
                      quotes escaped for use inside a C string literal
      compact_states  maps state number -> list of (rule number, dot position)

    Raises ValueError / AssertionError if a rule line does not have the
    expected shape.
    """
    states = {}
    compact_states = {}
    curstate = None
    uses_escapes = False

    for line in lines:
        if line[:6] == "state " and line[6:7].isdigit():
            curstate = int(line[6:])
            states[curstate] = []
            compact_states[curstate] = []
            continue
        if curstate is None:
            # we have not seen state 0, yet
            continue
        if "->" not in line:
            # inside a state, but this line is not a rule
            continue
        if "error" in line:
            # this is an error stabilization rule (plain substring match on
            # the whole line)
            continue

        # Now we have a rule.  Split off the rule number.
        text, ruleno = line.split("(rule ")
        text = text.strip()
        ruleno = ruleno.strip()
        assert ruleno.endswith(")")
        ruleno = int(ruleno[:-1])

        # The rule might contain funny characters.  Not all are supported
        # here.  Double quotes are escaped until the first already-escaped
        # quote shows up: recent bison versions escape them themselves.
        if not uses_escapes:
            if '\\"' in text:
                uses_escapes = True
            else:
                text = text.replace('"', '\\"')
        states[curstate].append(text)

        tokens = text.split()
        assert tokens[1] == "->"
        compact_states[curstate].append((ruleno, _dot_position(tokens[2:])))

    return states, compact_states


def _write_compact(out, compact_states):
    """Write the error_rules / error_state arrays for --compact mode.

    error_state[s] is the index into error_rules at which the
    (rule number, dot position) pairs of state s start; each run is
    terminated by {0,0}.
    """
    state_count = max(compact_states) + 1 if compact_states else 0
    errstate = []
    pos = 0
    out.write("int const error_rules[][2]={\n")
    for state in range(state_count):
        errstate.append(pos)
        for entry in compact_states.get(state, []) + [(0, 0)]:
            out.write(" {%4d, %2d}," % entry)
            if pos % 6 == 5:
                out.write("\n")
            pos += 1
    out.write("{0,0}\n};\n\n")
    out.write("int error_state[]={\n")
    for i, start in enumerate(errstate):
        out.write("%5d," % start)
        if i % 8 == 7:
            out.write("\n")
    out.write(" 0\n};\n")


def _write_messages(out, states):
    """Write the error_message array: one C string per state number.

    At most 8 rules are listed per state; longer lists are cut off with a
    "... more rules ..." marker.  States that never appeared in the input
    get an empty string so that array index == state number.
    """
    state_count = max(states) + 1 if states else 0
    out.write("char * const error_message[]={\n")
    for state in range(state_count):
        if state not in states:
            out.write('"",\n')
            continue
        rules = states[state]
        if len(rules) > 8:
            msg = "\\t" + "\\n\\t".join(rules[:8])
            msg = msg + "\\n... more rules ..."
        else:
            msg = "\\t" + "\\n\\t".join(rules)
        assert len(msg) < 2048  # MSVC limitation
        # Written for the truncated case as well; skipping it would shift
        # every following state to the wrong array index.
        out.write('"%s",\n' % msg)
    out.write("0};\n")


def main(argv):
    """Run the converter.

    argv is the argument list without the program name:
    [--compact] Y.OUTPUT ERRMSG.C.  On a malformed command line a
    diagnostic is printed and usage() terminates the program.
    """
    args = list(argv)

    compact = False
    if args and args[0] == "--compact":
        del args[0]
        compact = True

    if len(args) != 2:
        sys.stdout.write("expected 2 arguments, got %d: %s\n"
                         % (len(args), args))
        usage()

    if os.path.splitext(args[0])[1] != ".output":
        sys.stdout.write("%s does not end in .output\n" % args[0])
        usage()

    with open(args[0], "rt") as src:
        states, compact_states = _parse_states(src)

    with open(args[1], "w") as out:
        if compact:
            _write_compact(out, compact_states)
        else:
            _write_messages(out, states)


if __name__ == "__main__":
    main(sys.argv[1:])
    
