#!/usr/bin/perl


# Converts a "temporary TEI" morphosyntax (output of Anotatornia) to target TEI: xlink:href -> corresp, plus other changes.

# Usage: 
#	perl modify-tei-morphosyntax.pl <input file> <output file>
#

# Open the input (temporary TEI) and the output (target TEI) named on the
# command line. The three-argument form of open keeps characters in the file
# names from being interpreted as an open mode, and every failure is fatal:
# continuing with an unopened handle would silently produce no output.
@ARGV >= 2
    or die "Usage: perl $0 <input file> <output file>\n";

open(INTEXT, '<', $ARGV[0])
    or die "$0: cannot open input file '$ARGV[0]': $!\n";
open(OUTTEXT, '>', $ARGV[1])
    or die "$0: cannot open output file '$ARGV[1]': $!\n";

# Lookup tables filled while reading lexes and emptied again after each
# "choice" has been turned into an interpretation:
%msds = ();     # msd xml:id (e.g. "morph_1.1.2.3-msd") -> msd value
%ctags = ();    # lex ID (e.g. "1.1.2") -> ctag value
%bases = ();    # lex ID (e.g. "1.1.2") -> base form

# Main conversion loop: reads the temporary TEI from INTEXT one line at a
# time and writes the target TEI to OUTTEXT.
#
# State carried from line to line (package globals):
#   $curr_lex                 ID of the <fs type="lex"> being read, '' outside one
#   $curr_lex_content         buffered lines of the current lex, flushed at its </fs>
#   $curr_lex_had_empty_base  set when the current lex contained <string></string>
#   $catch_orth, $orth        the line following <f name="orth"> carries the orth
#   $cut_annotation_info      set while lines after the tool_report <fs> opening
#                             are suppressed (until the next <f fVal ...> line)
#   %bases, %ctags, %msds     values collected since the previous "choice"
#
# The order of the checks below matters: several flags are deliberately set
# only AFTER the current line has been printed or buffered.
while (<INTEXT>) {

    # Line-level rewrites: drop the xlink namespace declaration, turn
    # xlink:href pointers into @corresp, and tag the tool report.
    s#(<teiCorpus.*) xmlns:xlink.*>#$1>#;
    s/<s xlink:href=(".*?")/<s corresp=$1/;
    s/<p xlink:href=(".*?")/<p corresp=$1/;
    s/<seg xlink:href=(".*?")/<seg corresp=$1/;
    s/<fs type="tool_report">/<fs feats="#an8003" type="tool_report">/;

    # Unlike MSDs, ctags and bases carry no ID telling us which lex they come
    # from, so the ID of the lex being read is tracked separately.
    if (/<fs type="lex" xml:id="morph_(.*)-lex">/) {
        $curr_lex                = $1;
        $curr_lex_had_empty_base = 0;
        $curr_lex_content        = '';
    }

    # <symbol> with just one attribute, @value, is always inside
    # <f name="ctag">; inside <f name="msd">, <symbol> has at least @xml:id too.
    if (/<symbol value="(\S*)"\/>/) {
        $ctags{$curr_lex} = $1;
    }

    # Inside a lex, a <string> line is its base.
    if ($curr_lex && /<string>(.*?)<\/string>/) {
        $bases{$curr_lex} = $1;

        # Compare with '' explicitly: a plain truth test would also treat the
        # legitimate base "0" as empty.
        #
        # No recovery is attempted here: at this point we cannot tell a manual
        # lex from an automatic one, and automatic ones must be left alone.
        if ($bases{$curr_lex} eq '') {
            $curr_lex_had_empty_base = 1;
        }
    }

    # The line right after <f name="orth"> must hold the orth string.
    if ($catch_orth) {
        if (/<string>(.*?)<\/string>/) {
            $orth = $1;
        }
        else {
            print OUTTEXT "********************************* CATCH ORTH ERROR! *******************************************\n";
        }
        $catch_orth = 0;
    }

    # Set only after the check above, so it applies to the NEXT line.
    if (/<f name="orth">/) {
        $catch_orth = 1;
    }

    # <symbol> with two or three attributes is always one of the possible
    # interpretations inside <f name="msd">. Its xml:id (e.g.
    # "morph_1.1.2.3-msd", which also encodes the lex, here "1.1.2") is what
    # the disamb section points to, so all candidates go into one table.
    if (/<symbol( nkjp:manual="true")? value="(.*?)" xml:id="(.*?)"\/>/) {
        $msds{$3} = $2;
    }

    # A <symbol> marked nkjp:manual="true" tells us the current lex was
    # entered manually; such a lex has no right to have an empty base.
    if (/<symbol nkjp:manual="true".*\/>/ && $curr_lex_had_empty_base) {
        print "$ARGV[0]: Manually added lex $curr_lex has an empty base (orth \"$orth\"). ";
        if ($orth =~ /\d+/ || $orth eq '-') {
            print "It's a number or a dash: Correcting automatically\n";
            $curr_lex_content =~ s/<string><\/string>/<string>$orth<\/string>/;

            # Keep the lookup table in step with the corrected lex text, so
            # the interpretation emitted for this lex carries the same base.
            $bases{$curr_lex} = $orth if $curr_lex;

            # The base is no longer empty; do not report this lex again.
            $curr_lex_had_empty_base = 0;
        }
        else {
            print "\n";
        }
    }

    # The first <f fVal ...> line ends the suppressed part of the tool report.
    if (/<f fVal/) {
        $cut_annotation_info = 0;
    }

    # Pass the line through unless it starts with "<?o", is being suppressed,
    # or belongs to a lex (lex lines are buffered below instead).
    unless (/^<\?o/ || $cut_annotation_info || $curr_lex) {
        print OUTTEXT;
    }

    # Buffer lex lines so that an empty base can still be corrected before the
    # lex is written; flush the buffer when the lex closes.
    if ($curr_lex) {
        $curr_lex_content .= $_;
        if (/<\/fs>/) {
            $curr_lex = '';
            print OUTTEXT $curr_lex_content;
        }
    }

    # Set only after the line itself has been printed: suppression starts with
    # the line FOLLOWING the tool report opening.
    if (/<fs feats="#an8003"/) {
        $cut_annotation_info = 1;
    }

    # The chosen interpretation: emit "base:ctag:msd" right after the choice.
    if (/<f fVal="#(.*?)" name="choice"\/>/) {
        $key = $1;    # like "morph_1.1.2.1-msd"

        # Extract the part identifying the lex, in this example "1.1.2".
        $keylex = $key;
        $keylex =~ s/morph_//;
        $keylex =~ s/\.\d+-msd//;

        $intrp = "$bases{$keylex}:$ctags{$keylex}:$msds{$key}";
        $intrp =~ s/:$//;    # no trailing colon when the msd is empty
        print OUTTEXT "          <f name=\"interpretation\">\n           <string>$intrp<\/string>    <!-- interpretation -->\n          <\/f>\n";

        # Values for this segment have been used; start afresh for the next.
        %msds  = ();
        %ctags = ();
        %bases = ();
    }

}

# A failure to close the input cannot damage the result; report it and go on.
close(INTEXT)
    or warn "$0: error closing input file '$ARGV[0]': $!\n";

# Buffered write errors (e.g. a full disk) only surface when the output handle
# is closed, so a failure here means the output file may be incomplete.
close(OUTTEXT)
    or die "$0: error closing output file '$ARGV[1]': $!\n";

