|
|
|
|
|
|
|
|
print "Opening queue folder $queuedir\n"; |
|
|
print "Opening queue folder $queuedir\n"; |
|
|
opendir(Q,$queuedir); |
|
|
opendir(Q,$queuedir); |
|
|
foreach my $file ( readdir(Q) ) { |
|
|
foreach my $file ( readdir(Q) ) { |
|
|
|
|
|
next if $file =~ /^\./; |
|
|
print "processing file $file\n"; |
|
|
print "processing file $file\n"; |
|
|
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { |
|
|
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { |
|
|
my $docid = $1; |
|
|
my $docid = $1; |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
elsif ( $ext =~ /^(jpeg|png)$/ ) { |
|
|
elsif ( $ext =~ /^(jpeg|png)$/ ) { |
|
|
print "\tdetecting image rotation\n"; |
|
|
print "\tdetecting image rotation\n"; |
|
|
for(my $rot=0; $rot<=360; $rot+=90) { |
|
|
|
|
|
print "\t\ttrying $rot degrees rotation\n"; |
|
|
|
|
|
|
|
|
my @res; |
|
|
|
|
|
for(my $rot=0; $rot<360; $rot+=90) { |
|
|
|
|
|
print "\ttrying $rot degrees rotation\n"; |
|
|
my $tempfile = "/tmp/autodoc.$$.jpeg"; |
|
|
my $tempfile = "/tmp/autodoc.$$.jpeg"; |
|
|
system(sprintf("convert %s/%s -rotate %s %s", |
|
|
system(sprintf("convert %s/%s -rotate %s %s", |
|
|
$queuedir, $file, $rot, $tempfile)); |
|
|
$queuedir, $file, $rot, $tempfile)); |
|
|
|
|
|
|
|
|
my($lang,$words) = detect_lang(ocr_file($tempfile)); |
|
|
|
|
|
|
|
|
print "\t\trunning OCR\n"; |
|
|
|
|
|
my $txt = ocr_file($tempfile); |
|
|
|
|
|
print "\t\tlanguage and dictionary detection\n"; |
|
|
|
|
|
my($lang,$words, $dictmatches) = detect_lang($txt); |
|
|
|
|
|
|
|
|
print Dumper($lang, $words); |
|
|
|
|
|
|
|
|
print "\t\tfound %dictmatches words in dictionary\n"; |
|
|
|
|
|
|
|
|
|
|
|
push @res, { |
|
|
|
|
|
lang => $lang, |
|
|
|
|
|
words => $words, |
|
|
|
|
|
dictmatches => $dictmatches |
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
unlink($tempfile); |
|
|
unlink($tempfile); |
|
|
} |
|
|
} |
|
|
print "\trunning OCR on page\n"; |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
else { |
|
|
else { |
|
|
print "\terror: don't know how to process files of $ext type"; |
|
|
print "\terror: don't know how to process files of $ext type"; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
my %lcnt; |
|
|
my %lcnt; |
|
|
my @words; |
|
|
my @words; |
|
|
|
|
|
my $dictwords=0; |
|
|
|
|
|
|
|
|
foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { |
|
|
foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { |
|
|
next if length $word < 3; |
|
|
next if length $word < 3; |
|
|
|
|
|
|
|
|
push @words, $word; |
|
|
push @words, $word; |
|
|
|
|
|
|
|
|
my $lang; |
|
|
my $lang; |
|
|
|
|
|
my $found=0; |
|
|
my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); |
|
|
my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); |
|
|
while(my ($l) = $q->fetchrow_array()) { |
|
|
while(my ($l) = $q->fetchrow_array()) { |
|
|
$lcnt{$l}++; |
|
|
$lcnt{$l}++; |
|
|
|
|
|
$found=1; |
|
|
} |
|
|
} |
|
|
|
|
|
$dictwords++ if $found; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
print Dumper(\%lcnt); |
|
|
print Dumper(\%lcnt); |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
return ($lmax, \@words); |
|
|
|
|
|
|
|
|
return ($lmax, \@words, $dictwords); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
sub update_page_status { |
|
|
sub update_page_status { |