Attachment 'modify-tei-morphosyntax.pl'
Download 1 #!/usr/bin/perl
2
3
4 # Converts a "temporary TEI" morphosyntax (output of Anotatornia) to target TEI. xi:include->corresp and other changes.
5
6 # Usage:
7 # perl modify-tei-morphosyntax.pl <input file> <output file>
8 #
9
use strict;
use warnings;

# Main conversion pass: reads the "temporary TEI" morphosyntax file line by
# line and writes the target TEI file.
#
# Arguments (from @ARGV):
#   $ARGV[0]  path of the input file
#   $ARGV[1]  path of the output file (overwritten)
#
# Diagnostics about manually added lexes with an empty base go to STDOUT.
# The script dies if the arguments are missing or a file cannot be
# opened/closed, instead of silently producing an empty result.

die "Usage: perl modify-tei-morphosyntax.pl <input file> <output file>\n"
    unless @ARGV == 2;

my ($in_path, $out_path) = @ARGV;

# Three-argument open with lexical handles: the file names can never be
# interpreted as an open mode, and failures are reported immediately.
open my $in, '<', $in_path
    or die "Cannot open input file '$in_path': $!\n";
open my $out, '>', $out_path
    or die "Cannot open output file '$out_path': $!\n";

# Values collected since the last emitted interpretation:
my %msds  = ();    # msd xml:id (e.g. "morph_1.1.2.3-msd") -> msd value
my %ctags = ();    # lex id (e.g. "1.1.2")                 -> ctag value
my %bases = ();    # lex id                                -> base form

my $curr_lex                = '';    # id of the lex being read; '' when outside a lex
my $curr_lex_had_empty_base = 0;     # set when the current lex had an empty <string/>
my $curr_lex_content        = '';    # buffered lines of the current lex
my $orth                    = '';    # most recently seen orth string
my $catch_orth              = 0;     # previous line opened <f name="orth">
my $cut_annotation_info     = 0;     # suppress output inside the tool_report fs

while (my $line = <$in>) {

    # xlink-based attributes become plain @corresp; the xlink namespace
    # declaration on the root element is dropped.
    $line =~ s#(<teiCorpus.*) xmlns:xlink.*>#$1>#;
    $line =~ s/<s xlink:href=(".*?")/<s corresp=$1/;
    $line =~ s/<p xlink:href=(".*?")/<p corresp=$1/;
    $line =~ s/<seg xlink:href=(".*?")/<seg corresp=$1/;
    $line =~ s/<fs type="tool_report">/<fs feats="#an8003" type="tool_report">/;

    # Unlike MSDs, ctags and bases carry no ID telling us which lex they
    # belong to, so the current lex ID is tracked separately.
    if ($line =~ /<fs type="lex" xml:id="morph_(.*)-lex">/) {
        $curr_lex                = $1;
        $curr_lex_had_empty_base = 0;
        $curr_lex_content        = '';
    }

    # <symbol> with only @value is always inside <f name="ctag">; inside
    # <f name="msd"> the <symbol> has at least an @xml:id as well.
    if ($line =~ /<symbol value="(\S*)"\/>/) {
        $ctags{$curr_lex} = $1;
    }

    # Inside a lex, a <string> holds the base form.
    if ($curr_lex ne '' && $line =~ /<string>(.*?)<\/string>/) {
        my $base = $1;
        $bases{$curr_lex} = $base;

        # Compare against the empty string explicitly: a base of "0" is a
        # valid value and must not be reported as empty. The base is not
        # repaired here because automatically generated lexes must stay
        # untouched; manually added ones are handled further down.
        $curr_lex_had_empty_base = 1 if $base eq '';
    }

    # The line following <f name="orth"> is expected to carry the orth.
    if ($catch_orth) {
        if ($line =~ /<string>(.*?)<\/string>/) {
            $orth = $1;
        }
        else {
            print {$out} "********************************* CATCH ORTH ERROR! *******************************************\n";
        }
        $catch_orth = 0;
    }

    if ($line =~ /<f name="orth">/) {
        $catch_orth = 1;
    }

    # <symbol> with two or three attributes is always one of the possible
    # interpretations inside <f name="msd">. Its xml:id (for example
    # "morph_1.1.2.3-msd", which also encodes lex "1.1.2") is what the
    # disamb section points at, so all candidates are kept in one hash.
    if ($line =~ /<symbol(?: nkjp:manual="true")? value="(.*?)" xml:id="(.*?)"\/>/) {
        $msds{$2} = $1;
    }

    # A manual <symbol> marks the current lex as manually entered; such a
    # lex is not allowed to have an empty base.
    if ($line =~ /<symbol nkjp:manual="true".*\/>/ && $curr_lex_had_empty_base) {
        print "$in_path: Manually added lex $curr_lex has an empty base (orth \"$orth\"). ";
        if ($orth =~ /\d+/ || $orth eq '-') {
            print "It's a number or a dash: Correcting automatically\n";
            $curr_lex_content =~ s/<string><\/string>/<string>$orth<\/string>/;

            # Keep the recorded base in step with the corrected lex so the
            # interpretation emitted later does not start with an empty base.
            $bases{$curr_lex} = $orth;
        }
        else {
            print "\n";
        }
    }

    # The first <f fVal=...> ends the region cut out of the tool_report fs.
    if ($line =~ /<f fVal/) {
        $cut_annotation_info = 0;
    }

    # Pass the line through unless it starts with "<?o", lies in the cut
    # region, or belongs to a lex (lex lines are buffered and emitted as a
    # whole once the lex is closed).
    if ($line !~ /^<\?o/ && !$cut_annotation_info && $curr_lex eq '') {
        print {$out} $line;
    }

    if ($curr_lex ne '') {
        $curr_lex_content .= $line;
        if ($line =~ /<\/fs>/) {
            $curr_lex = '';
            print {$out} $curr_lex_content;
        }
    }

    # Start cutting only after the opening line itself has been written.
    if ($line =~ /<fs feats="#an8003"/) {
        $cut_annotation_info = 1;
    }

    if ($line =~ /<f fVal="#(.*?)" name="choice"\/>/) {
        my $key = $1;    # like "morph_1.1.2.1-msd"

        # Extract the part identifying the lex, here "1.1.2".
        my $keylex = $key;
        $keylex =~ s/morph_//;
        $keylex =~ s/\.\d+-msd//;

        # Missing components are rendered as empty strings.
        my $base = defined $bases{$keylex} ? $bases{$keylex} : '';
        my $ctag = defined $ctags{$keylex} ? $ctags{$keylex} : '';
        my $msd  = defined $msds{$key}     ? $msds{$key}     : '';

        my $intrp = "$base:$ctag:$msd";
        $intrp =~ s/:$//;    # no trailing colon when the msd is empty
        print {$out} " <f name=\"interpretation\">\n <string>$intrp<\/string> <!-- interpretation -->\n <\/f>\n";

        # The collected values have been used; start afresh.
        %msds  = ();
        %ctags = ();
        %bases = ();
    }
}

close $in;

# Buffered write errors only surface at close time, so check it.
close $out
    or die "Cannot close output file '$out_path': $!\n";
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.