loncom/localize/localize/checksimilar_2files.pl - view

File: [LON-CAPA] / loncom / localize / localize / checksimilar_2files.pl
Revision 1.7: download - view: text, annotated - select for diffs
Wed Sep 25 13:22:42 2013 UTC (10 years, 9 months ago) by bisitz
Branches: MAIN
CVS tags: HEAD

Also ignore brackets for better matching of similar phrases.

1: #!/usr/bin/perl 2: # The LearningOnline Network with CAPA 3: # $Id: checksimilar_2files.pl,v 1.7 2013/09/25 13:22:42 bisitz Exp $ 4: 5: use strict; 6: use warnings; 7: use utf8; 8: use open ':utf8'; 9: 10: #### 11: #### Checks, if there are similar keys in the two inputfiles. 12: #### For example, check the current lang.pm (first input) and newphrases. 13: #### So if there are similar keys you don't have to translate 14: #### them again but use the old value and just modify it. 15: #### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!! 16: 17: 18: ####--------Subroutines--------#### 19: 20: sub read { 21: # Read file into memory 22: my $file = shift; 23: open(IN,$file) or die "Error: Could not open file: $file\n"; 24: my %filecontent = (); 25: my $contents = join('',<IN>); 26: close(IN); 27: # Build hash with hash from file 28: my %Lexicon = (); 29: eval($contents.'; %filecontent=%Lexicon;'); 30: if ($@ ne "") { 31: die "\nAn error occurred during the attempt to retrieve the translation hash.\n" 32: ."Error: ".$@."\n"; 33: } 34: return %filecontent; 35: } 36: 37: sub similar_chars { 38: my $text = shift; 39: $text =~ s/\[_\d\]//g; # translation parameters 40: $text =~ s/[.,\_\-?!: \/]//g; # punctuation 41: return $text; 42: } 43: 44: 45: 46: sub similar_phrases { 47: 48: my $text1 = shift; 49: my $text2 = shift; 50: 51: my %phrases = ( 52: 'courses' => 1, 53: 'communities' => 1, 54: 'course' => 2, 55: 'community' => 2, 56: 'member' => 3, 57: 'student' => 3, 58: 'students' => 3, 59: 'construction'=> 4, 60: 'authoring' => 4, 61: ); 62: 63: foreach my $word (keys %phrases) { 64: $text1 =~ s/$word/X$phrases{$word}X/gi; 65: $text2 =~ s/$word/X$phrases{$word}X/gi; 66: } 67: 68: if (lc($text1) eq lc($text2)) { 69: return 1; 70: } 71: 72: return 0; 73: } 74: 75: 76: 77: ####--------Main Program--------#### 78: 79: if (!$ARGV[0] or !$ARGV[1]) { 80: die 'Error: Invalid files! Please specify two files which should be checked.'."\n"; 81: } 82: 83: my $file1 = $ARGV[0]; # Old language.pm 84: my $file2 = $ARGV[1]; # New Phrases 85: 86: print("Checking for similar expressions in phrases in $file1 and $file2...\n"); 87: 88: my %langOLD = &read($file1); #Hash with old phrases 89: my %langNEW = &read($file2); #Hash with new phrases 90: my $dlm; 91: my $count = 0; 92: 93: # For each new phrase, check if there is already a similar one 94: while( my ($kNEW, $vNEW) = each %langNEW ) { 95: my $temp1 = $kNEW; 96: $temp1 = &similar_chars($temp1); 97: 98: while( my ($kOLD, $vOLD) = each %langOLD ) { 99: my $temp2 = $kOLD; 100: $temp2 = &similar_chars($temp2); 101: 102: #Check for similar punctuation (case insensitive) or 103: #similarity related to similar phrases 104: if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) { 105: #Find delimiter for key and value 106: if (($kNEW=~/\'/) & ($kNEW=~/\"/)) { 107: print " (Warning: Both, ' and \", occur!)"; 108: } 109: if ($kNEW=~/\'/) { 110: $dlm = '"'; 111: } else { 112: $dlm = "'"; 113: } 114: print (<<ENDNEW); 115: # $kOLD #(Old key) 116: $dlm$kNEW$dlm 117: => $dlm$vOLD$dlm, 118: 119: ENDNEW 120: $count++; 121: 122: } 123: } 124: } 125: print("Finished. ".$count." similar expressions found!\n"); 126: 127: