Locked History Actions

attachment:modify-tei-morphosyntax.pl of Converters4NKJP

Attachment 'modify-tei-morphosyntax.pl'

Download

   #!/usr/bin/perl

   # Converts a "temporary TEI" morphosyntax file (output of Anotatornia) to the
   # target TEI: xlink:href attributes become corresp, the tool_report feature
   # structure gets a feats pointer, the annotation info that follows it is cut,
   # and an "interpretation" feature (base:ctag:msd) is emitted after every
   # "choice" line.
   #
   # Usage:
   #     perl modify-tei-morphosyntax.pl <input file> <output file>
   #
   # The input is processed strictly line by line with regular expressions (no
   # XML parser), so it relies on the one-element-per-line layout of the input.
   # The order of the steps inside the main loop matters: several of them set
   # state that is only meant to take effect from the NEXT line on.
   #
   # Diagnostics about manually added lexes with an empty base go to STDOUT.

   use strict;
   use warnings;

   die "Usage: perl $0 <input file> <output file>\n" if @ARGV < 2;
   my ($input_path, $output_path) = @ARGV;

   # Three-argument open with checked results: a missing input or an unwritable
   # output must stop the conversion instead of silently producing nothing.
   open my $in, '<', $input_path
       or die "Cannot open '$input_path' for reading: $!\n";
   open my $out, '>', $output_path
       or die "Cannot open '$output_path' for writing: $!\n";

   # Values collected since the last "choice" line.
   my %msds;     # msd xml:id (like "morph_1.1.2.3-msd") -> msd value
   my %ctags;    # lex id (like "1.1.2")                 -> ctag value
   my %bases;    # lex id                                -> base form

   # State of the lex currently being read ('' when outside of a lex).
   my $curr_lex                = '';
   my $curr_lex_had_empty_base = 0;
   my $curr_lex_content        = '';   # buffered lines of the current lex

   my $catch_orth          = 0;    # previous line opened <f name="orth">
   my $orth                = '';   # most recently seen orth string
   my $cut_annotation_info = 0;    # suppress output until the next <f fVal line

   # The body works on $_ throughout: every step is a match or substitution on
   # the current line and no subroutine that could clobber $_ is called.
   while (<$in>) {

       # --- Line-level rewrites ------------------------------------------------
       s#(<teiCorpus.*) xmlns:xlink.*>#$1>#;
       s/<s xlink:href=(".*?")/<s corresp=$1/;
       s/<p xlink:href=(".*?")/<p corresp=$1/;
       s/<seg xlink:href=(".*?")/<seg corresp=$1/;
       s/<fs type="tool_report">/<fs feats="#an8003" type="tool_report">/;

       # --- Lex tracking -------------------------------------------------------
       # Unlike MSDs, ctags and bases have no ID telling us which lex they come
       # from, so the ID of the lex being read is tracked separately.
       if (/<fs type="lex" xml:id="morph_(.*)-lex">/) {
           $curr_lex                = $1;
           $curr_lex_had_empty_base = 0;
           $curr_lex_content        = '';
       }

       # <symbol> with just one attribute, @value, is always inside
       # <f name="ctag">; inside <f name="msd">, <symbol> also has at least
       # @xml:id.
       if (/<symbol value="(\S*)"\/>/) {
           $ctags{$curr_lex} = $1;
       }

       # Inside a lex, a <string> element holds the base form.
       if ($curr_lex ne '' && /<string>(.*?)<\/string>/) {
           $bases{$curr_lex} = $1;

           # Compare against the empty string rather than testing truthiness, so
           # that a base of "0" is not mistaken for an empty one.
           # Empty bases are only recorded here, not repaired: telling manual
           # lexes from automatic ones at this point is too much hassle, and
           # automatic ones must not be touched.
           if ($1 eq '') {
               $curr_lex_had_empty_base = 1;
           }
       }

       # --- Orth tracking ------------------------------------------------------
       # The line right after <f name="orth"> is expected to carry the orth in
       # a <string> element.  If it does not, a marker line is written into the
       # output file itself.
       if ($catch_orth) {
           if (/<string>(.*?)<\/string>/) {
               $orth = $1;
           }
           else {
               print {$out} "********************************* CATCH ORTH ERROR! *******************************************\n";
           }
           $catch_orth = 0;
       }

       if (/<f name="orth">/) {
           $catch_orth = 1;
       }

       # --- MSD collection -----------------------------------------------------
       # <symbol> with two or three attributes is always part of the list of
       # possible interpretations inside <f name="msd">.  The xml:id is the full
       # pointer used in the disamb section, so all candidate values can be kept
       # in a single hash addressed by IDs like "morph_1.1.2.3-msd" (the ID also
       # encodes the lex, here "1.1.2").
       if (/<symbol( nkjp:manual="true")? value="(.*?)" xml:id="(.*?)"\/>/) {
           $msds{$3} = $2;
       }

       # The <symbol> element is used only to notice that the current lex was
       # entered manually; such a lex has no right to have an empty base.
       if (/<symbol nkjp:manual="true".*\/>/ && $curr_lex_had_empty_base) {
           print "$input_path: Manually added lex $curr_lex has an empty base (orth \"$orth\"). ";
           if ($orth =~ /\d+/ || $orth eq '-') {
               print "It's a number or a dash: Correcting automatically\n";
               # The base line is already in the buffer, so patch it there.
               $curr_lex_content =~ s/<string><\/string>/<string>$orth<\/string>/;
           }
           else {
               print "\n";
           }
       }

       # --- Output -------------------------------------------------------------
       # An <f fVal line ends the annotation info that is being cut.
       if (/<f fVal/) {
           $cut_annotation_info = 0;
       }

       # Lines starting with "<?o" are dropped, cut annotation info is dropped,
       # and lines of a lex are buffered below instead of being printed here.
       unless (/^<\?o/ || $cut_annotation_info || $curr_lex ne '') {
           print {$out} $_;
       }

       # Buffer the lex and flush it once its closing </fs> has been read, so
       # that an empty base can still be patched before it is written.
       if ($curr_lex ne '') {
           $curr_lex_content .= $_;
           if (/<\/fs>/) {
               $curr_lex = '';
               print {$out} $curr_lex_content;
           }
       }

       # Set only after the line itself has been printed: cutting starts with
       # the line that follows the tool_report <fs>.
       if (/<fs feats="#an8003"/) {
           $cut_annotation_info = 1;
       }

       # --- Interpretation -----------------------------------------------------
       if (/<f fVal="#(.*?)" name="choice"\/>/) {
           my $key = $1;    # like "morph_1.1.2.1-msd"

           # Extract the part of the key that identifies the lex, in this
           # example "1.1.2".
           my $keylex = $key;
           $keylex =~ s/morph_//;
           $keylex =~ s/\.\d+-msd//;

           # Missing parts are rendered as empty strings.
           my $base = defined $bases{$keylex} ? $bases{$keylex} : '';
           my $ctag = defined $ctags{$keylex} ? $ctags{$keylex} : '';
           my $msd  = defined $msds{$key}     ? $msds{$key}     : '';

           my $intrp = "$base:$ctag:$msd";
           $intrp =~ s/:$//;    # drop the trailing colon left by an empty msd

           print {$out} "          <f name=\"interpretation\">\n           <string>$intrp<\/string>    <!-- interpretation -->\n          <\/f>\n";

           # The collected values have been used => reset them.
           %msds  = ();
           %ctags = ();
           %bases = ();
       }
   }

   close $in;
   # Buffered write errors only surface at close time, so check it.
   close $out
       or die "Error while closing '$output_path': $!\n";

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2014-12-29 14:20:43, 4.2 KB) [[attachment:modify-tei-morphosyntax.pl]]
  • [get | view] (2014-12-29 14:20:43, 1.1 KB) [[attachment:modify-tei-segmentation.pl]]
  • [get | view] (2014-12-29 14:20:43, 0.7 KB) [[attachment:modify-tei-senses.pl]]
 All files | Selected Files: delete move to page

You are not allowed to attach a file to this page.