--- loncom/localize/localize/checksimilar_2files.pl 2010/03/09 15:16:26 1.1 +++ loncom/localize/localize/checksimilar_2files.pl 2013/12/20 14:27:54 1.8 @@ -1,6 +1,6 @@ #!/usr/bin/perl # The LearningOnline Network with CAPA -# $Id: checksimilar_2files.pl,v 1.1 2010/03/09 15:16:26 wenzelju Exp $ +# $Id: checksimilar_2files.pl,v 1.8 2013/12/20 14:27:54 bisitz Exp $ use strict; use warnings; @@ -20,7 +20,7 @@ use open ':utf8'; sub read { # Read file into memory my $file = shift; - open(IN,$file) or die; + open(IN,$file) or die "Error: Could not open file: $file\n"; my %filecontent = (); my $contents = join('',); close(IN); @@ -28,36 +28,45 @@ sub read { my %Lexicon = (); eval($contents.'; %filecontent=%Lexicon;'); if ($@ ne "") { - print "\nAn error occurred during the attempt to retrieve the translation hash.\n" + die "\nAn error occurred during the attempt to retrieve the translation hash.\n" ."Error: ".$@."\n"; - die; } return %filecontent; } -sub similarities{ +sub similar_chars { my $text = shift; - $text =~ s/[.,\_\-?!:]//g; + $text =~ s/<\/*\w+ *\/*>//g; # HTML tags + $text =~ s/\[_\d\]//g; # translation parameters + $text =~ s/[.,\_\-?!: \/\(\)]//g; # punctuation return $text; } -sub CourseCommunity { +sub similar_phrases { my $text1 = shift; my $text2 = shift; - $text1 =~ s/courses//gi; - $text1 =~ s/communities//gi; - $text1 =~ s/course//gi; - $text1 =~ s/community//gi; - $text2 =~ s/courses//gi; - $text2 =~ s/communities//gi; - $text2 =~ s/course//gi; - $text2 =~ s/community//gi; + my %phrases = ( + 'courses' => 1, + 'communities' => 1, + 'course' => 2, + 'community' => 2, + 'member' => 3, + 'student' => 3, + 'students' => 3, + 'construction'=> 4, + 'authoring' => 4, + ); + + foreach my $word (keys %phrases) { + $text1 =~ s/$word/X$phrases{$word}X/gi; + $text2 =~ s/$word/X$phrases{$word}X/gi; + } - if(lc($text1) eq lc($text2)) { + if (lc($text1) eq lc($text2)) { return 1; } @@ -68,27 +77,32 @@ sub CourseCommunity { ####--------Main Program--------#### +if (!$ARGV[0] or !$ARGV[1]) { + die 'Error: Invalid files! Please specify two files which should be checked.'."\n"; +} + my $file1 = $ARGV[0]; # Old language.pm my $file2 = $ARGV[1]; # New Phrases + +print("Checking for similar expressions in phrases in $file1 and $file2...\n"); + my %langOLD = &read($file1); #Hash with old phrases my %langNEW = &read($file2); #Hash with new phrases my $dlm; -my $count = 1; #Counter - -open(OUT,'>similarities.txt') or die; +my $count = 0; # For each new phrase, check if there is already a similar one while( my ($kNEW, $vNEW) = each %langNEW ) { my $temp1 = $kNEW; - $temp1 = &similarities($temp1); + $temp1 = &similar_chars($temp1); while( my ($kOLD, $vOLD) = each %langOLD ) { my $temp2 = $kOLD; - $temp2 = &similarities($temp2); + $temp2 = &similar_chars($temp2); #Check for similar punctuation (case insensitive) or - #similarity related to Course/Community - if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){ + #similarity related to similar phrases + if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) { #Find delimiter for key and value if (($kNEW=~/\'/) & ($kNEW=~/\"/)) { print " (Warning: Both, ' and \", occur!)"; @@ -98,8 +112,8 @@ while( my ($kNEW, $vNEW) = each %langNEW } else { $dlm = "'"; } - print OUT (< $dlm$vOLD$dlm,