| print "Opening queue folder $queuedir\n"; | print "Opening queue folder $queuedir\n"; | ||||
| opendir(Q,$queuedir); | opendir(Q,$queuedir); | ||||
| foreach my $file ( readdir(Q) ) { | foreach my $file ( readdir(Q) ) { | ||||
| next if $file =~ /^\./; | |||||
| print "processing file $file\n"; | print "processing file $file\n"; | ||||
| if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { | if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { | ||||
| my $docid = $1; | my $docid = $1; | ||||
| } | } | ||||
| elsif ( $ext =~ /^(jpeg|png)$/ ) { | elsif ( $ext =~ /^(jpeg|png)$/ ) { | ||||
| print "\tdetecting image rotation\n"; | print "\tdetecting image rotation\n"; | ||||
| for(my $rot=0; $rot<=360; $rot+=90) { | |||||
| print "\t\ttrying $rot degrees rotation\n"; | |||||
| my @res; | |||||
| for(my $rot=0; $rot<360; $rot+=90) { | |||||
| print "\ttrying $rot degrees rotation\n"; | |||||
| my $tempfile = "/tmp/autodoc.$$.jpeg"; | my $tempfile = "/tmp/autodoc.$$.jpeg"; | ||||
| system(sprintf("convert %s/%s -rotate %s %s", | system(sprintf("convert %s/%s -rotate %s %s", | ||||
| $queuedir, $file, $rot, $tempfile)); | $queuedir, $file, $rot, $tempfile)); | ||||
| my($lang,$words) = detect_lang(ocr_file($tempfile)); | |||||
| print "\t\trunning OCR\n"; | |||||
| my $txt = ocr_file($tempfile); | |||||
| print "\t\tlanguage and dictionary detection\n"; | |||||
| my($lang,$words, $dictmatches) = detect_lang($txt); | |||||
| print Dumper($lang, $words); | |||||
| print "\t\tfound %dictmatches words in dictionary\n"; | |||||
| push @res, { | |||||
| lang => $lang, | |||||
| words => $words, | |||||
| dictmatches => $dictmatches | |||||
| }; | |||||
| unlink($tempfile); | unlink($tempfile); | ||||
| } | } | ||||
| print "\trunning OCR on page\n"; | |||||
| } | } | ||||
| else { | else { | ||||
| print "\terror: don't know how to process files of $ext type"; | print "\terror: don't know how to process files of $ext type"; | ||||
| my %lcnt; | my %lcnt; | ||||
| my @words; | my @words; | ||||
| my $dictwords=0; | |||||
| foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { | foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { | ||||
| next if length $word < 3; | next if length $word < 3; | ||||
| push @words, $word; | push @words, $word; | ||||
| my $lang; | my $lang; | ||||
| my $found=0; | |||||
| my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); | my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); | ||||
| while(my ($l) = $q->fetchrow_array()) { | while(my ($l) = $q->fetchrow_array()) { | ||||
| $lcnt{$l}++; | $lcnt{$l}++; | ||||
| $found=1; | |||||
| } | } | ||||
| $dictwords++ if $found; | |||||
| } | } | ||||
| print Dumper(\%lcnt); | print Dumper(\%lcnt); | ||||
| } | } | ||||
| } | } | ||||
| return ($lmax, \@words); | |||||
| return ($lmax, \@words, $dictwords); | |||||
| } | } | ||||
| sub update_page_status { | sub update_page_status { |