--- loncom/build/xfml_parse.pl 2002/01/29 10:42:42 1.1 +++ loncom/build/xfml_parse.pl 2002/02/01 10:56:41 1.2 @@ -1,11 +1,35 @@ #!/usr/bin/perl # YEAR=2002 -# 1/26,1/27,1/28 - Scott Harrison +# 1/26,1/27,1/28,1/29,1/30,1/31 - Scott Harrison +# +### # Read in 2 XML file; first is the filter specification, the second # is the XML file to be filtered +############################################################################### +## ## +## ORGANIZATION OF THIS PERL SCRIPT ## +## 1. Notes ## +## 2. Get command line arguments ## +## 3. First pass through (grab distribution-specific information) ## +## 4. Second pass through (parse out what is not necessary) ## +## 5. Third pass through (translate markup according to specified mode) ## +## 6. Functions (most all just format contents of different markup tags) ## +## 7. POD (plain old documentation, CPAN style) ## +## ## +############################################################################### + +# ----------------------------------------------------------------------- Notes +# +# I am using a multiple pass-through approach to parsing +# the xfml file. This saves memory and makes sure the server +# will never be overloaded. +# +# This is meant to parse files meeting the piml document type. +# See xfml.dtd. XFML=XML Filtering Markup Language. + use HTML::TokeParser; use strict; @@ -21,7 +45,9 @@ END my %eh; my %ih; my $tofilter=shift @ARGV; -my @lines=<>; my $parsestring=join('',@lines); undef @lines; +open IN,"<$tofilter"; +my @lines=; my $parsestring=join('',@lines); undef @lines; +close IN; my $parser = HTML::TokeParser->new(\$parsestring) or die('can\'t create TokeParser object'); $parser->xml_mode('1'); @@ -47,13 +73,11 @@ my %eha; while (my $token = $parser->get_tag('xfml')) { &format_xfml(@{$token}); $text = $parser->get_text('/xfml'); -# print $xfml; -# print $text; $token = $parser->get_tag('/xfml'); } -open IN,"<$tofilter"; -my @lines2=; close IN; my $parsestring2=join('',@lines2); undef @lines2; +#open IN,"<$tofilter"; +my @lines2=<>; my $parsestring2=join('',@lines2); undef @lines2; $parser = HTML::TokeParser->new(\$parsestring2) or die('can\'t create TokeParser object'); $parser->xml_mode('1'); @@ -68,22 +92,10 @@ my $excluden=0; my $excludea=0; my $et=0; my $cdata=''; +my $excludenold=0; +my $ign=0; + while ($token = $parser->get_token()) { -# from HTML::TokeParser documentation: -# ["S", $tag, %$attr, @$attrseq, $text] -# ["E", $tag, $text] -# ["T", $text, $is_data] -# ["C", $text] -# ["D", $text] -# ["PI", $token0, $text] -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'name'}}, -# @{$conditions{'name'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'attribute'}}, -# @{$conditions{'attribute'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'value'}}, -# @{$conditions{'value'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'cdata'}}, -# @{$conditions{'cdata'}}; if ($token->[0] eq 'D') { print $token->[1]; } @@ -102,7 +114,6 @@ while ($token = $parser->get_token()) { if ($eh{$token->[1]}) { $echild=$token->[1]; -# print "ECHILD=$echild\n"; } if ($echild) { # run through names for echild @@ -114,7 +125,7 @@ while ($token = $parser->get_token()) { foreach my $namematch (@{$namemlist}) { my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; if ($name=~/$nm/) { -# print "NMATCH: $nm ($name)\n"; + $excludenold=$excluden; $excluden++; foreach my $attributemlist (@{$eha{$echild}->{'attribute'}}) { @@ -124,15 +135,9 @@ while ($token = $parser->get_token()) { split(/\=/,$attributematch,2); $am=~s/^.//; $am=~s/.$//; -# print 'AM:'."($an,$am)\t"; -# print 'ATT:'.join(',',%atthash)."\n"; if ($atthash{$an}) { if ($atthash{$an}=~/$am/) { $excludea++; -# print "AMATCH: $am (". -# join(',', -# @attributes) -# ."\n"; } } } @@ -150,8 +155,13 @@ while ($token = $parser->get_token()) { if ($echild) { $tr.=$token->[2]; if ($excluden) { + my $i=0; + CDATALOOP: foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { + $i++; + my $j; foreach my $cdatamatch (@{$cdatamlist}) { + $j++; # print "CDATA: $cdatamatch, $cdata\n"; my $cm=$cdatamatch; my $not=0; @@ -160,19 +170,24 @@ while ($token = $parser->get_token()) { $cm=~s/^.//; } $cm=~s/^.//; $cm=~s/.$//; + if ($not and $cdata=~/$cm/) { + $ign=1; $exclude=0; + } if ((!$not and $cdata!~/$cm/) or ($not and $cdata=~/$cm/)) { -# print "CMISMATCH: $cm ($cdata)\n"; +# nothing happens +# $exclude=0; } elsif (($not and $cdata!~/$cm/) or (!$not and $cdata=~/$cm/)) { - $exclude++; + $exclude++ unless $ign; } } } } } if ($eh{$token->[1]}) { + $ign=0; $echild=0; if (!$exclude and !$excludea) { print $tr; @@ -180,7 +195,7 @@ while ($token = $parser->get_token()) { $tr=''; } elsif ($exclude>0 or $excludea>0) { -# print "EXCLUDING $token->[1] $excludea $excluden\n"; +# print "EXCLUDING $token->[1] $exclude $excludea $excluden\n"; $exclude=0; $excluden=0; $excludea=0; $tr=''; } @@ -222,6 +237,8 @@ sub trim { my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; } + + # --------------------------------------------------------- Format xfml section sub format_xfml { my (@tokeninfo)=@_; @@ -236,7 +253,6 @@ sub format_when_name { push @{$conditions{'name'}},$att_match; my $text=&trim($parser->get_text('/when:name')); $parser->get_tag('/when:name'); -# print 'Name Matching...'.$att_match; $wloc--; &cc unless $wloc; return ''; @@ -250,7 +266,6 @@ sub format_when_attribute { push @{$conditions{'attribute'}},$att_match; my $text=&trim($parser->get_text('/when:attribute')); $parser->get_tag('/when:attribute'); -# print 'Attribute Matching...'.$att_match; $wloc--; &cc unless $wloc; return ''; @@ -261,7 +276,6 @@ sub format_when_cdata { my (@tokeninfo)=@_; $wloc++; my $att_match=$tokeninfo[2]->{'match'}; -# print 'Cdata Matching...'.$att_match; push @{$conditions{'cdata'}},$att_match; my $text=&trim($parser->get_text('/when:cdata')); $parser->get_tag('/when:cdata');