Browse Source

ocr

master
Pascal Gloor 5 years ago
parent
commit
ebd4cbf94e
1 changed file with 21 additions and 6 deletions
  1. 21
    6
      bin/autodoc_process.pl

+ 21
- 6
bin/autodoc_process.pl View File

@@ -30,6 +30,7 @@ while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }
print "Opening queue folder $queuedir\n";
opendir(Q,$queuedir);
foreach my $file ( readdir(Q) ) {
next if $file =~ /^\./;
print "processing file $file\n";
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
my $docid = $1;
@@ -85,19 +86,29 @@ foreach my $file ( readdir(Q) ) {
}
elsif ( $ext =~ /^(jpeg|png)$/ ) {
print "\tdetecting image rotation\n";
for(my $rot=0; $rot<=360; $rot+=90) {
print "\t\ttrying $rot degrees rotation\n";
my @res;
for(my $rot=0; $rot<360; $rot+=90) {
print "\ttrying $rot degrees rotation\n";
my $tempfile = "/tmp/autodoc.$$.jpeg";
system(sprintf("convert %s/%s -rotate %s %s",
$queuedir, $file, $rot, $tempfile));

my($lang,$words) = detect_lang(ocr_file($tempfile));
print "\t\trunning OCR\n";
my $txt = ocr_file($tempfile);
print "\t\tlanguage and dictionary detection\n";
my($lang,$words, $dictmatches) = detect_lang($txt);

print Dumper($lang, $words);
print "\t\tfound %dictmatches words in dictionary\n";

push @res, {
lang => $lang,
words => $words,
dictmatches => $dictmatches
};

unlink($tempfile);
}
print "\trunning OCR on page\n";
}
else {
print "\terror: don't know how to process files of $ext type";
@@ -159,6 +170,7 @@ sub detect_lang {

my %lcnt;
my @words;
my $dictwords=0;

foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
next if length $word < 3;
@@ -166,10 +178,13 @@ sub detect_lang {
push @words, $word;

my $lang;
my $found=0;
my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
while(my ($l) = $q->fetchrow_array()) {
$lcnt{$l}++;
$found=1;
}
$dictwords++ if $found;
}

print Dumper(\%lcnt);
@@ -184,7 +199,7 @@ sub detect_lang {
}
}

return ($lmax, \@words);
return ($lmax, \@words, $dictwords);
}

sub update_page_status {

Loading…
Cancel
Save