| @@ -30,6 +30,7 @@ while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; } | |||
| print "Opening queue folder $queuedir\n"; | |||
| opendir(Q,$queuedir); | |||
| foreach my $file ( readdir(Q) ) { | |||
| next if $file =~ /^\./; | |||
| print "processing file $file\n"; | |||
| if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { | |||
| my $docid = $1; | |||
| @@ -85,19 +86,29 @@ foreach my $file ( readdir(Q) ) { | |||
| } | |||
| elsif ( $ext =~ /^(jpeg|png)$/ ) { | |||
| print "\tdetecting image rotation\n"; | |||
| for(my $rot=0; $rot<=360; $rot+=90) { | |||
| print "\t\ttrying $rot degrees rotation\n"; | |||
| my @res; | |||
| # Try each right-angle rotation (0, 90, 180 and 270 degrees; 360 would just | |||
| # repeat 0) and append one result record per rotation to @res. | |||
| for(my $rot=0; $rot<360; $rot+=90) { | |||
|     print "\ttrying $rot degrees rotation\n"; | |||
|     # Scratch file for the rotated copy; $$ is the current process ID. | |||
|     my $tempfile = "/tmp/autodoc.$$.jpeg"; | |||
|     system(sprintf("convert %s/%s -rotate %s %s", | |||
|         $queuedir, $file, $rot, $tempfile)); | |||
|     print "\t\trunning OCR\n"; | |||
|     # Run OCR exactly once per rotation and reuse the text for detection. | |||
|     my $txt = ocr_file($tempfile); | |||
|     print "\t\tlanguage and dictionary detection\n"; | |||
|     my($lang,$words, $dictmatches) = detect_lang($txt); | |||
|     print Dumper($lang, $words); | |||
|     # Interpolate the scalar: "%dictmatches" is not interpolated inside a | |||
|     # double-quoted string and would be printed literally. | |||
|     print "\t\tfound $dictmatches words in dictionary\n"; | |||
|     push @res, { | |||
|         lang => $lang, | |||
|         words => $words, | |||
|         dictmatches => $dictmatches | |||
|     }; | |||
|     # Remove the scratch file before the next rotation reuses the same name. | |||
|     unlink($tempfile); | |||
| } | |||
| print "\trunning OCR on page\n"; | |||
| } | |||
| else { | |||
| print "\terror: don't know how to process files of $ext type"; | |||
| @@ -159,6 +170,7 @@ sub detect_lang { | |||
| my %lcnt; | |||
| my @words; | |||
| my $dictwords=0; | |||
| foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { | |||
| next if length $word < 3; | |||
| @@ -166,10 +178,13 @@ sub detect_lang { | |||
| push @words, $word; | |||
| my $lang; | |||
| my $found=0; | |||
| my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); | |||
| while(my ($l) = $q->fetchrow_array()) { | |||
| $lcnt{$l}++; | |||
| $found=1; | |||
| } | |||
| $dictwords++ if $found; | |||
| } | |||
| print Dumper(\%lcnt); | |||
| @@ -184,7 +199,7 @@ sub detect_lang { | |||
| } | |||
| } | |||
| return ($lmax, \@words); | |||
| return ($lmax, \@words, $dictwords); | |||
| } | |||
| sub update_page_status { | |||