loncom/localize/localize/checksimilar_2files.pl - view

File: [LON-CAPA] / loncom / localize / localize / checksimilar_2files.pl
Revision 1.4: download - view: text, annotated - select for diffs
Thu Jan 10 18:07:52 2013 UTC (11 years, 8 months ago) by bisitz
Branches: MAIN
CVS tags: HEAD

- Detect even more similarities by extending list of similar characters and phrases
- Script code:
    - Better fitting sub routine names
    - White spaces follow LON-CAPA standard

#!/usr/bin/perl # The LearningOnline Network with CAPA # $Id: checksimilar_2files.pl,v 1.4 2013/01/10 18:07:52 bisitz Exp $ use strict; use warnings; use utf8; use open ':utf8'; #### #### Checks, if there are similar keys in the two inputfiles. #### For example, check the current lang.pm (first input) and newphrases. #### So if there are similar keys you don't have to translate #### them again but use the old value and just modify it. #### IMPORTANT: Both inputfiles have to contain a hash %Lexicon (like lang.pm) !!! ####--------Subroutines--------#### sub read { # Read file into memory my $file = shift; open(IN,$file) or die; my %filecontent = (); my $contents = join('',<IN>); close(IN); # Build hash with hash from file my %Lexicon = (); eval($contents.'; %filecontent=%Lexicon;'); if ($@ ne "") { print "\nAn error occurred during the attempt to retrieve the translation hash.\n" ."Error: ".$@."\n"; die; } return %filecontent; } sub similar_chars { my $text = shift; $text =~ s/\[_\d\]//g; # translation parameters $text =~ s/[.,\_\-?!: \/]//g; # punctuation return $text; } sub similar_phrases { my $text1 = shift; my $text2 = shift; $text1 =~ s/courses/X001X/gi; $text1 =~ s/communities/X001X/gi; $text1 =~ s/course/X002X/gi; $text1 =~ s/community/X002X/gi; $text1 =~ s/member/X003X/gi; $text1 =~ s/student/X003X/gi; $text1 =~ s/students/X003X/gi; $text2 =~ s/courses/X001X/gi; $text2 =~ s/communities/X001X/gi; $text2 =~ s/course/X002X/gi; $text2 =~ s/community/X002X/gi; $text2 =~ s/member/X003X/gi; $text2 =~ s/student/X003X/gi; $text2 =~ s/students/X003X/gi; if (lc($text1) eq lc($text2)) { return 1; } return 0; } ####--------Main Program--------#### my $file1 = $ARGV[0]; # Old language.pm my $file2 = $ARGV[1]; # New Phrases print("Checking for similar expressions in phrases in $file1 and $file2...\n"); my %langOLD = &read($file1); #Hash with old phrases my %langNEW = &read($file2); #Hash with new phrases my $dlm; my $count = 0; # For each new phrase, check if there is already a similar one while( my ($kNEW, $vNEW) = each %langNEW ) { my $temp1 = $kNEW; $temp1 = &similar_chars($temp1); while( my ($kOLD, $vOLD) = each %langOLD ) { my $temp2 = $kOLD; $temp2 = &similar_chars($temp2); #Check for similar punctuation (case insensitive) or #similarity related to similar phrases if (lc($temp1) eq lc($temp2) || &similar_phrases($temp1,$temp2)) { #Find delimiter for key and value if (($kNEW=~/\'/) & ($kNEW=~/\"/)) { print " (Warning: Both, ' and \", occur!)"; } if ($kNEW=~/\'/) { $dlm = '"'; } else { $dlm = "'"; } print (<<ENDNEW); # $kOLD #(Old key) $dlm$kNEW$dlm => $dlm$vOLD$dlm, ENDNEW $count++; } } } print("Finished. ".$count." similar expressions found!\n");