Browse Source

images ocr

master
Pascal Gloor 5 years ago
parent
commit
badf8640d3
1 changed files with 54 additions and 7 deletions
  1. 54
    7
      bin/autodoc_process.pl

+ 54
- 7
bin/autodoc_process.pl View File



$Data::Dumper::Sortkeys = 1; $Data::Dumper::Sortkeys = 1;


print "Loading configuration\n";
my $conf = load_conf("../etc/autodoc.json"); my $conf = load_conf("../etc/autodoc.json");

print "Connecting to database\n";
my $dbh = sqlconnect($conf->{sql}); my $dbh = sqlconnect($conf->{sql});


my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue}; my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original}; my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};


my %langid;
my %primary; my %primary;

print "Loading languages\n";
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang"); my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; } while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }


print "Opening queue folder $queuedir\n";
opendir(Q,$queuedir); opendir(Q,$queuedir);
foreach my $file ( readdir(Q) ) { foreach my $file ( readdir(Q) ) {
print "processing file $file\n";
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
my $docid = $1; my $docid = $1;
my $ext = $3; my $ext = $3;


print "Found document id $docid of type $ext\n";
print "\tdocument id $docid of type $ext\n";


if ( $ext eq 'pdf' ) { if ( $ext eq 'pdf' ) {
my @pages; my @pages;
for(my $page=0;; $page++) { for(my $page=0;; $page++) {
my $txt = ''; my $txt = '';
print "texting page $page\n";
open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last; open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
while(<TXT>) { while(<TXT>) {
chomp; chomp;
# end of PDF # end of PDF
last if $?; last if $?;


print "\t\textracted text from PDF for page $page\n";


my ($lang,$words) = detect_lang($txt); my ($lang,$words) = detect_lang($txt);
print "language is $lang\n";
#spell_check($txt,$lang);
print "\t\tdetected language is $lang\n";
my $pageid = get_new_page($docid); my $pageid = get_new_page($docid);
print "new page id $pageid\n";
print "\t\tcreated page id $pageid\n";
print "\t\tupdating page status to 'inprogress'\n";
update_page_status($pageid, 'inprogress'); update_page_status($pageid, 'inprogress');


if ( !exists $primary{$docid} ) { if ( !exists $primary{$docid} ) {
print "\t\tsetting document for default primary thumbnail\n";
$primary{$docid}=undef; $primary{$docid}=undef;
sqlquery($dbh, "CALL set_primary_page(?)",$pageid); sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
} }




print "create original page jpeg $pageid.jpeg";
print "\t\tcreating original page jpeg $pageid.jpeg";
system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
system(sprintf("mv %s/%s.jpg %s/%s.jpeg", system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
$originaldir, $pageid, $originaldir, $pageid,
$originaldir, $pageid )); $originaldir, $pageid ));


print "\t\tloading extracted words into database\n";
create_page_words($pageid, $lang, $words); create_page_words($pageid, $lang, $words);


print "\t\tupdating page status to 'ok'\n";
update_page_status($pageid, 'ok'); update_page_status($pageid, 'ok');


} }
} }
elsif ( $ext =~ /^(jpeg|png)$/ ) {
print "\tdetecting image rotation\n";
for(my $rot=0; $rot<=360; $rot+=90) {
print "\t\ttrying $rot degrees rotation\n";
my $tempfile = "/tmp/autodoc.$$.jpeg";
system(sprintf("convert %s/%s -rotate %s %s",
$queuedir, $file, $rot, $tempfile));

my($lang,$words) = detect_lang(ocr_file($tempfile));

print Dumper($lang, $words);

unlink($tempfile);
}
print "\trunning OCR on page\n";
}
else { else {
print "\terror: don't know how to process files of $ext type";
} }
} }
else {
print "\terror: file doesn't contain manadatory UUIDs in its name\n";
}
} }
closedir(Q); closedir(Q);


print "done\n";

# open queue # open queue
# decompose PDF # decompose PDF
# normalise all files as jpegs # normalise all files as jpegs
# ocr / lang detect # ocr / lang detect
# update db # update db


# ocr_file($file)
#
# Runs the external `tesseract` OCR program on the image at $file and
# returns everything it writes to standard output as a single string.
#
# Parameters:
#   $file - path of the image file handed to tesseract.
#
# Returns:
#   The text tesseract printed (possibly multi-line), or the empty string
#   when the program could not be started or produced no output.
#
# Failures are reported with warn() rather than die() so that one bad image
# does not abort the caller's processing loop.
sub ocr_file {
    my($file) = @_;
    my $txt = '';

    # List-form pipe open: the arguments go straight to exec() without a
    # shell, so odd characters in $file cannot be interpreted as shell
    # syntax. The trailing '-' is passed as tesseract's output name, as in
    # the original command line, so the text can be read from the pipe.
    my $ocr;
    if ( !open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-') ) {
        warn "ocr_file: cannot run tesseract on $file: $!\n";
        return $txt;
    }

    while ( my $line = <$ocr> ) {
        $txt .= $line;
    }

    # close() on a pipe waits for the child and returns false when it
    # exited non-zero; keep whatever text was read, but make it visible.
    if ( !close($ocr) ) {
        warn "ocr_file: tesseract failed on $file (wait status $?)\n";
    }

    return $txt;
}
sub create_page_words { sub create_page_words {
my($pageid, $lang, $words) = @_; my($pageid, $lang, $words) = @_;



Loading…
Cancel
Save