Explorar el Código

finished image ocr

master
Pascal Gloor hace 5 años
padre
commit
e16f557794
Se ha modificado 1 fichero con 35 adiciones y 4 borrados
  1. 35
    4
      bin/autodoc_process.pl

+ 35
- 4
bin/autodoc_process.pl Ver fichero

@@ -81,12 +81,12 @@ foreach my $file ( readdir(Q) ) {

print "\t\tupdating page status to 'ok'\n";
update_page_status($pageid, 'ok');
print "\tdone\n";
}
}
elsif ( $ext =~ /^(jpeg|png)$/ ) {
print "\tdetecting image rotation\n";
my @res;
my %res;
for(my $rot=0; $rot<360; $rot+=90) {
print "\ttrying $rot degrees rotation\n";
my $tempfile = "/tmp/autodoc.$$.jpeg";
@@ -100,7 +100,7 @@ foreach my $file ( readdir(Q) ) {

print "\t\tfound $dictmatches words in dictionary\n";

push @res, {
$res{$rot} = {
lang => $lang,
words => $words,
dictmatches => $dictmatches
@@ -109,6 +109,37 @@ foreach my $file ( readdir(Q) ) {
unlink($tempfile);
}

# Pick the rotation whose OCR output matched the most dictionary words.
# Rotations are visited in ascending numeric order and only a strictly
# better score replaces the current pick, so ties (including the case where
# nothing matched at all) resolve to the smallest angle instead of depending
# on hash iteration order.
my $maxwords = 0;
my $bestrot;
foreach my $rot ( sort { $a <=> $b } keys %res ) {
    $bestrot = $rot if !defined $bestrot;
    if ( $maxwords < $res{$rot}{dictmatches} ) {
        $maxwords = $res{$rot}{dictmatches};
        $bestrot  = $rot;
    }
}

print "\tbest OCR results with $bestrot rotation\n";

# Register a new page for this document and flag it as being processed.
my $pageid = get_new_page($docid);
print "\t\tcreated page id $pageid\n";

print "\t\tupdating page status to 'inprogress'\n";
update_page_status($pageid, 'inprogress');

# The first page seen for a document is registered through
# set_primary_page; %primary remembers which documents were handled.
if ( !exists $primary{$docid} ) {
    print "\t\tsetting document for default primary thumbnail\n";
    $primary{$docid} = undef;
    sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
}

# Convert the queued image to JPEG. The list form of system() bypasses the
# shell, so queue file names containing spaces or shell metacharacters are
# passed to convert verbatim instead of being interpreted.
# NOTE(review): the image is converted as-is; $bestrot is not applied to the
# stored copy -- confirm that keeping the unrotated original is intended.
print "\t\tcreating original page jpeg $pageid.jpeg\n";
my $convert_status = system('convert', "$queuedir/$file", "$originaldir/$pageid.jpeg");
if ( $convert_status != 0 ) {
    # Report the failure but keep going, as the original code did.
    print "\t\terror: convert failed for $queuedir/$file (status $convert_status)\n";
}

# Store the words extracted at the best rotation, then mark the page done.
print "\t\tloading extracted words into database\n";
create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
print "\t\tupdating page status to 'ok'\n";
update_page_status($pageid, 'ok');
print "\tdone\n";
}
else {
print "\terror: don't know how to process files of $ext type";
@@ -187,7 +218,7 @@ sub detect_lang {
$dictwords++ if $found;
}

print Dumper(\%lcnt);
#print Dumper(\%lcnt);

my $max = 0;
my $lmax;

Cargando…
Cancelar
Guardar