Browse Source

ocr

master
Pascal Gloor 5 years ago
parent
commit
ebd4cbf94e
1 changed file with 21 additions and 6 deletions
  1. 21
    6
      bin/autodoc_process.pl

+ 21
- 6
bin/autodoc_process.pl View File

print "Opening queue folder $queuedir\n"; print "Opening queue folder $queuedir\n";
opendir(Q,$queuedir); opendir(Q,$queuedir);
foreach my $file ( readdir(Q) ) { foreach my $file ( readdir(Q) ) {
next if $file =~ /^\./;
print "processing file $file\n"; print "processing file $file\n";
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
my $docid = $1; my $docid = $1;
} }
elsif ( $ext =~ /^(jpeg|png)$/ ) { elsif ( $ext =~ /^(jpeg|png)$/ ) {
print "\tdetecting image rotation\n"; print "\tdetecting image rotation\n";
for(my $rot=0; $rot<=360; $rot+=90) {
print "\t\ttrying $rot degrees rotation\n";
my @res;
for(my $rot=0; $rot<360; $rot+=90) {
print "\ttrying $rot degrees rotation\n";
my $tempfile = "/tmp/autodoc.$$.jpeg"; my $tempfile = "/tmp/autodoc.$$.jpeg";
system(sprintf("convert %s/%s -rotate %s %s", system(sprintf("convert %s/%s -rotate %s %s",
$queuedir, $file, $rot, $tempfile)); $queuedir, $file, $rot, $tempfile));


my($lang,$words) = detect_lang(ocr_file($tempfile));
print "\t\trunning OCR\n";
my $txt = ocr_file($tempfile);
print "\t\tlanguage and dictionary detection\n";
my($lang,$words, $dictmatches) = detect_lang($txt);


print Dumper($lang, $words);
print "\t\tfound %dictmatches words in dictionary\n";

push @res, {
lang => $lang,
words => $words,
dictmatches => $dictmatches
};


unlink($tempfile); unlink($tempfile);
} }
print "\trunning OCR on page\n";
} }
else { else {
print "\terror: don't know how to process files of $ext type"; print "\terror: don't know how to process files of $ext type";


my %lcnt; my %lcnt;
my @words; my @words;
my $dictwords=0;


foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) { foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
next if length $word < 3; next if length $word < 3;
push @words, $word; push @words, $word;


my $lang; my $lang;
my $found=0;
my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
while(my ($l) = $q->fetchrow_array()) { while(my ($l) = $q->fetchrow_array()) {
$lcnt{$l}++; $lcnt{$l}++;
$found=1;
} }
$dictwords++ if $found;
} }


print Dumper(\%lcnt); print Dumper(\%lcnt);
} }
} }


return ($lmax, \@words);
return ($lmax, \@words, $dictwords);
} }


sub update_page_status { sub update_page_status {

Loading…
Cancel
Save