loncom/build/xfml_parse.pl - view

File: [LON-CAPA] / loncom / build / xfml_parse.pl
Revision 1.2: download - view: text, annotated - select for diffs
Fri Feb 1 10:56:41 2002 UTC (22 years, 5 months ago) by harris41
Branches: MAIN
CVS tags: stable_2002_spring, HEAD

a lot of cleaning up, debugging, and commenting

1: #!/usr/bin/perl 2: 3: # YEAR=2002 4: # 1/26,1/27,1/28,1/29,1/30,1/31 - Scott Harrison 5: # 6: ### 7: 8: # Read in 2 XML file; first is the filter specification, the second 9: # is the XML file to be filtered 10: 11: ############################################################################### 12: ## ## 13: ## ORGANIZATION OF THIS PERL SCRIPT ## 14: ## 1. Notes ## 15: ## 2. Get command line arguments ## 16: ## 3. First pass through (grab distribution-specific information) ## 17: ## 4. Second pass through (parse out what is not necessary) ## 18: ## 5. Third pass through (translate markup according to specified mode) ## 19: ## 6. Functions (most all just format contents of different markup tags) ## 20: ## 7. POD (plain old documentation, CPAN style) ## 21: ## ## 22: ############################################################################### 23: 24: # ----------------------------------------------------------------------- Notes 25: # 26: # I am using a multiple pass-through approach to parsing 27: # the xfml file. This saves memory and makes sure the server 28: # will never be overloaded. 29: # 30: # This is meant to parse files meeting the piml document type. 31: # See xfml.dtd. XFML=XML Filtering Markup Language. 32: 33: use HTML::TokeParser; 34: use strict; 35: 36: unless (@ARGV) { 37: print <<END; 38: Incorrect invocation. 39: Example usages: 40: cat loncapafiles.lpml | perl xfml_parse.pl valid_hosts.xfml 41: perl xfml_parse.pl valid_hosts.xfml loncapafiles.lpml 42: END 43: } 44: 45: my %eh; 46: my %ih; 47: my $tofilter=shift @ARGV; 48: open IN,"<$tofilter"; 49: my @lines=<IN>; my $parsestring=join('',@lines); undef @lines; 50: close IN; 51: my $parser = HTML::TokeParser->new(\$parsestring) or 52: die('can\'t create TokeParser object'); 53: $parser->xml_mode('1'); 54: 55: # Define handling methods for mode-dependent text rendering 56: 57: my %conditions; &cc; 58: 59: $parser->{textify}={ 60: xfml => \&format_xfml, 61: 'when:name' => \&format_when_name, 62: 'when:attribute' => \&format_when_attribute, 63: 'when:cdata' => \&format_when_cdata, 64: 'choice:include' => \&format_choice_include, 65: 'choice:exclude' => \&format_choice_exclude, 66: }; 67: 68: my $text; 69: my $xfml; 70: my $wloc=0; 71: my %eha; 72: 73: while (my $token = $parser->get_tag('xfml')) { 74: &format_xfml(@{$token}); 75: $text = $parser->get_text('/xfml'); 76: $token = $parser->get_tag('/xfml'); 77: } 78: 79: #open IN,"<$tofilter"; 80: my @lines2=<>; my $parsestring2=join('',@lines2); undef @lines2; 81: $parser = HTML::TokeParser->new(\$parsestring2) or 82: die('can\'t create TokeParser object'); 83: $parser->xml_mode('1'); 84: 85: my $token; 86: my $hloc=0; 87: my %ts; 88: my $tr; 89: my $echild=0; 90: my $exclude=0; 91: my $excluden=0; 92: my $excludea=0; 93: my $et=0; 94: my $cdata=''; 95: my $excludenold=0; 96: my $ign=0; 97: 98: while ($token = $parser->get_token()) { 99: if ($token->[0] eq 'D') { 100: print $token->[1]; 101: } 102: elsif ($token->[0] eq 'C') { 103: print $token->[1]; 104: } 105: elsif ($token->[0] eq 'S') { 106: $cdata=''; 107: $hloc++; 108: # if token can be excluded, then pretend it is until all conditions are 109: # run (eha); then output during end tag processing 110: # else, output 111: 112: # a token can be excluded when it is an eh key, or a child node of 113: # an eh key 114: 115: if ($eh{$token->[1]}) { 116: $echild=$token->[1]; 117: } 118: if ($echild) { 119: # run through names for echild 120: # then attributes and/or values and/or cdata 121: my $name=$token->[1]; 122: my @attributes=@{$token->[3]}; 123: my %atthash=%{$token->[2]}; 124: foreach my $namemlist (@{$eha{$echild}->{'name'}}) { 125: foreach my $namematch (@{$namemlist}) { 126: my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; 127: if ($name=~/$nm/) { 128: $excludenold=$excluden; 129: $excluden++; 130: foreach my $attributemlist 131: (@{$eha{$echild}->{'attribute'}}) { 132: foreach my $attributematch 133: (@{$attributemlist}) { 134: my ($an,$am)= 135: split(/\=/,$attributematch,2); 136: $am=~s/^.//; 137: $am=~s/.$//; 138: if ($atthash{$an}) { 139: if ($atthash{$an}=~/$am/) { 140: $excludea++; 141: } 142: } 143: } 144: } 145: } 146: } 147: } 148: $tr.=$token->[4]; 149: } 150: else { 151: print $token->[4]; 152: } 153: } 154: elsif ($token->[0] eq 'E') { 155: if ($echild) { 156: $tr.=$token->[2]; 157: if ($excluden) { 158: my $i=0; 159: CDATALOOP: 160: foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { 161: $i++; 162: my $j; 163: foreach my $cdatamatch (@{$cdatamlist}) { 164: $j++; 165: # print "CDATA: $cdatamatch, $cdata\n"; 166: my $cm=$cdatamatch; 167: my $not=0; 168: if ($cm=~/\!/) { 169: $not=1; 170: $cm=~s/^.//; 171: } 172: $cm=~s/^.//; $cm=~s/.$//; 173: if ($not and $cdata=~/$cm/) { 174: $ign=1; $exclude=0; 175: } 176: if ((!$not and $cdata!~/$cm/) 177: or ($not and $cdata=~/$cm/)) { 178: # nothing happens 179: # $exclude=0; 180: } 181: elsif (($not and $cdata!~/$cm/) 182: or (!$not and $cdata=~/$cm/)) { 183: $exclude++ unless $ign; 184: } 185: } 186: } 187: } 188: } 189: if ($eh{$token->[1]}) { 190: $ign=0; 191: $echild=0; 192: if (!$exclude and !$excludea) { 193: print $tr; 194: # print $token->[2]; 195: $tr=''; 196: } 197: elsif ($exclude>0 or $excludea>0) { 198: # print "EXCLUDING $token->[1] $exclude $excludea $excluden\n"; 199: $exclude=0; $excluden=0; $excludea=0; 200: $tr=''; 201: } 202: $exclude=0; $excluden=0; $excludea=0; 203: } 204: else { 205: if ($echild) { 206: # $tr.=$token->[2]; 207: } 208: else { 209: print $token->[2]; 210: $tr=''; 211: } 212: } 213: $hloc--; 214: } 215: elsif ($token->[0] eq 'T') { 216: if ($echild) { 217: $tr.=$token->[1]; 218: $cdata=$token->[1]; 219: } 220: else { 221: print $token->[1]; 222: $tr=''; 223: } 224: } 225: } 226: 227: # ------------------------------------------------------------ clear conditions 228: sub cc { 229: @{$conditions{'name'}}=(); pop @{$conditions{'name'}}; 230: @{$conditions{'attribute'}}=(); pop @{$conditions{'attribute'}}; 231: @{$conditions{'value'}}=(); pop @{$conditions{'value'}}; 232: @{$conditions{'cdata'}}=(); pop @{$conditions{'cdata'}}; 233: } 234: 235: # --------------------------------------- remove starting and ending whitespace 236: sub trim { 237: my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; 238: } 239: 240: 241: 242: # --------------------------------------------------------- Format xfml section 243: sub format_xfml { 244: my (@tokeninfo)=@_; 245: return ''; 246: } 247: 248: # ---------------------------------------------------- Format when:name section 249: sub format_when_name { 250: my (@tokeninfo)=@_; 251: $wloc++; 252: my $att_match=$tokeninfo[2]->{'match'}; 253: push @{$conditions{'name'}},$att_match; 254: my $text=&trim($parser->get_text('/when:name')); 255: $parser->get_tag('/when:name'); 256: $wloc--; 257: &cc unless $wloc; 258: return ''; 259: } 260: 261: # ----------------------------------------------- Format when:attribute section 262: sub format_when_attribute { 263: my (@tokeninfo)=@_; 264: $wloc++; 265: my $att_match=$tokeninfo[2]->{'match'}; 266: push @{$conditions{'attribute'}},$att_match; 267: my $text=&trim($parser->get_text('/when:attribute')); 268: $parser->get_tag('/when:attribute'); 269: $wloc--; 270: &cc unless $wloc; 271: return ''; 272: } 273: 274: # --------------------------------------------------- Format when:cdata section 275: sub format_when_cdata { 276: my (@tokeninfo)=@_; 277: $wloc++; 278: my $att_match=$tokeninfo[2]->{'match'}; 279: push @{$conditions{'cdata'}},$att_match; 280: my $text=&trim($parser->get_text('/when:cdata')); 281: $parser->get_tag('/when:cdata'); 282: $wloc--; 283: &cc unless $wloc; 284: return ''; 285: } 286: 287: # ----------------------------------------------- Format choice:include section 288: sub format_choice_include { 289: my (@tokeninfo)=@_; 290: my $text=&trim($parser->get_text('/choice:include')); 291: $parser->get_tag('/choice:include'); 292: $ih{$tokeninfo[2]->{'match'}}++; 293: return ''; 294: } 295: 296: # ----------------------------------------------- Format choice:exclude section 297: sub format_choice_exclude { 298: my (@tokeninfo)=@_; 299: my $text=&trim($parser->get_text('/choice:exclude')); 300: $parser->get_tag('/choice:exclude'); 301: $eh{$tokeninfo[2]->{'nodename'}}++; 302: push @{$eha{$tokeninfo[2]->{'nodename'}}->{'name'}}, 303: [@{$conditions{'name'}}]; 304: push @{$eha{$tokeninfo[2]->{'nodename'}}->{'attribute'}}, 305: [@{$conditions{'attribute'}}]; 306: push @{$eha{$tokeninfo[2]->{'nodename'}}->{'value'}}, 307: [@{$conditions{'value'}}]; 308: push @{$eha{$tokeninfo[2]->{'nodename'}}->{'cdata'}}, 309: [@{$conditions{'cdata'}}]; 310: return ''; 311: }