--- loncom/localize/localize/checksimilar_2files.pl	2010/03/09 15:16:26	1.1
+++ loncom/localize/localize/checksimilar_2files.pl	2013/12/20 14:27:54	1.8
@@ -1,6 +1,6 @@
 #!/usr/bin/perl
 # The LearningOnline Network with CAPA
-# $Id: checksimilar_2files.pl,v 1.1 2010/03/09 15:16:26 wenzelju Exp $
+# $Id: checksimilar_2files.pl,v 1.8 2013/12/20 14:27:54 bisitz Exp $
 
 use strict;
 use warnings;
@@ -20,7 +20,7 @@ use open ':utf8';
 sub read {
     # Read file into memory
     my $file = shift;
-    open(IN,$file) or die;
+    open(IN, '<', $file) or die "Error: Could not open file: $file: $!\n";
     my %filecontent = ();
     my $contents = join('',<IN>);
     close(IN);
@@ -28,36 +28,45 @@ sub read {
     my %Lexicon = ();
     eval($contents.'; %filecontent=%Lexicon;');
     if ($@ ne "") {
-        print "\nAn error occurred during the attempt to retrieve the translation hash.\n"
+        die "\nAn error occurred during the attempt to retrieve the translation hash.\n"
              ."Error: ".$@."\n";
-        die;
     }
     return %filecontent;
 }
 
-sub similarities{
+sub similar_chars {
     my $text = shift;
-    $text =~ s/[.,\_\-?!:]//g;
+    $text =~ s/<\/*\w+ *\/*>//g; # HTML tags
+    $text =~ s/\[_\d+\]//g; # translation parameters, incl. multi-digit ones like [_10]
+    $text =~ s/[.,\_\-?!: \/\(\)]//g; # punctuation
     return $text;
 }
 
 
 
-sub CourseCommunity {
+sub similar_phrases {
     
     my $text1 = shift;
     my $text2 = shift;
     
-    $text1 =~ s/courses//gi;
-    $text1 =~ s/communities//gi;    
-    $text1 =~ s/course//gi;
-    $text1 =~ s/community//gi;
-    $text2 =~ s/courses//gi;
-    $text2 =~ s/communities//gi;
-    $text2 =~ s/course//gi;
-    $text2 =~ s/community//gi;
+    my %phrases = (
+        'courses'     => 1,
+        'communities' => 1,
+        'course'      => 2,
+        'community'   => 2,
+        'member'      => 3,
+        'student'     => 3,
+        'students'    => 3,
+        'construction'=> 4,
+        'authoring'   => 4,
+    );
+    # Longest words first: hash order is random, and 'course' must not eat part of 'courses'
+    foreach my $word (sort { length($b) <=> length($a) or $a cmp $b } keys %phrases) {
+        $text1 =~ s/$word/X$phrases{$word}X/gi;
+        $text2 =~ s/$word/X$phrases{$word}X/gi;
+    }
 
-    if(lc($text1) eq lc($text2)) {
+    if (lc($text1) eq lc($text2)) {
         return 1;
     }
     
@@ -68,27 +77,32 @@ sub CourseCommunity {
 
 ####--------Main Program--------####
 
+if (@ARGV < 2 or $ARGV[0] eq '' or $ARGV[1] eq '') { # count/emptiness, so a file named "0" is accepted
+    die 'Error: Invalid files! Please specify two files which should be checked.'."\n";
+}
+
 my $file1 = $ARGV[0];  # Old language.pm
 my $file2 = $ARGV[1];  # New Phrases
+
+print("Checking for similar expressions in phrases in $file1 and $file2...\n");
+
 my %langOLD = &read($file1); #Hash with old phrases
 my %langNEW = &read($file2); #Hash with new phrases
 my $dlm; 
-my $count = 1; #Counter
-
-open(OUT,'>similarities.txt') or die;
+my $count = 0;
 
 # For each new phrase, check if there is already a similar one
 while( my ($kNEW, $vNEW) = each %langNEW ) {
     my $temp1 = $kNEW;
-    $temp1 = &similarities($temp1);
+    $temp1 = &similar_chars($temp1);
    
     while( my ($kOLD, $vOLD) = each %langOLD ) {
         my $temp2 = $kOLD;
-        $temp2 = &similarities($temp2);
+        $temp2 = &similar_chars($temp2);
 
         #Check for similar punctuation (case insensitive) or
-        #similarity related to Course/Community 
-        if(lc($temp1) eq lc($temp2) || &CourseCommunity($temp1,$temp2)){
+        #similarity related to similar phrases 
+        if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) {
             #Find delimiter for key and value
             if (($kNEW=~/\'/) & ($kNEW=~/\"/)) {
                 print " (Warning: Both, ' and \", occur!)";
@@ -98,8 +112,8 @@ while( my ($kNEW, $vNEW) = each %langNEW
 	    } else {
 	        $dlm = "'";
 	    }
-            print OUT (<<ENDNEW);
-#Old key: $kOLD
+            print (<<ENDNEW);
+#   $kOLD #(Old key)
    $dlm$kNEW$dlm
 => $dlm$vOLD$dlm,