|
|
@@ -11,30 +11,36 @@ use warnings; |
|
|
|
|
|
|
|
$Data::Dumper::Sortkeys = 1; |
|
|
|
|
|
|
|
print "Loading configuration\n"; |
|
|
|
my $conf = load_conf("../etc/autodoc.json"); |
|
|
|
|
|
|
|
print "Connecting to database\n"; |
|
|
|
my $dbh = sqlconnect($conf->{sql}); |
|
|
|
|
|
|
|
my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue}; |
|
|
|
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original}; |
|
|
|
|
|
|
|
my %langid; |
|
|
|
my %primary; |
|
|
|
|
|
|
|
print "Loading languages\n"; |
|
|
|
my %langid; |
|
|
|
my $q = sqlquery($dbh, "SELECT id,short FROM lang"); |
|
|
|
while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; } |
|
|
|
|
|
|
|
print "Opening queue folder $queuedir\n"; |
|
|
|
opendir(Q,$queuedir); |
|
|
|
foreach my $file ( readdir(Q) ) { |
|
|
|
print "processing file $file\n"; |
|
|
|
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { |
|
|
|
my $docid = $1; |
|
|
|
my $ext = $3; |
|
|
|
|
|
|
|
print "Found document id $docid of type $ext\n"; |
|
|
|
print "\tdocument id $docid of type $ext\n"; |
|
|
|
|
|
|
|
if ( $ext eq 'pdf' ) { |
|
|
|
my @pages; |
|
|
|
for(my $page=0;; $page++) { |
|
|
|
my $txt = ''; |
|
|
|
print "texting page $page\n"; |
|
|
|
open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last; |
|
|
|
while(<TXT>) { |
|
|
|
chomp; |
|
|
@@ -45,37 +51,66 @@ foreach my $file ( readdir(Q) ) { |
|
|
|
# end of PDF |
|
|
|
last if $?; |
|
|
|
|
|
|
|
print "\t\textracted text from PDF for page $page\n"; |
|
|
|
|
|
|
|
|
|
|
|
my ($lang,$words) = detect_lang($txt); |
|
|
|
print "language is $lang\n"; |
|
|
|
#spell_check($txt,$lang); |
|
|
|
print "\t\tdetected language is $lang\n"; |
|
|
|
|
|
|
|
my $pageid = get_new_page($docid); |
|
|
|
print "new page id $pageid\n"; |
|
|
|
print "\t\tcreated page id $pageid\n"; |
|
|
|
print "\t\tupdating page status to 'inprogress'\n"; |
|
|
|
update_page_status($pageid, 'inprogress'); |
|
|
|
|
|
|
|
if ( !exists $primary{$docid} ) { |
|
|
|
print "\t\tsetting document for default primary thumbnail\n"; |
|
|
|
$primary{$docid}=undef; |
|
|
|
sqlquery($dbh, "CALL set_primary_page(?)",$pageid); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
print "create original page jpeg $pageid.jpeg"; |
|
|
|
print "\t\tcreating original page jpeg $pageid.jpeg"; |
|
|
|
system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); |
|
|
|
system(sprintf("mv %s/%s.jpg %s/%s.jpeg", |
|
|
|
$originaldir, $pageid, |
|
|
|
$originaldir, $pageid )); |
|
|
|
|
|
|
|
print "\t\tloading extracted words into database\n"; |
|
|
|
create_page_words($pageid, $lang, $words); |
|
|
|
|
|
|
|
print "\t\tupdating page status to 'ok'\n"; |
|
|
|
update_page_status($pageid, 'ok'); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
elsif ( $ext =~ /^(jpeg|png)$/ ) { |
|
|
|
print "\tdetecting image rotation\n"; |
|
|
|
for(my $rot=0; $rot<=360; $rot+=90) { |
|
|
|
print "\t\ttrying $rot degrees rotation\n"; |
|
|
|
my $tempfile = "/tmp/autodoc.$$.jpeg"; |
|
|
|
system(sprintf("convert %s/%s -rotate %s %s", |
|
|
|
$queuedir, $file, $rot, $tempfile)); |
|
|
|
|
|
|
|
my($lang,$words) = detect_lang(ocr_file($tempfile)); |
|
|
|
|
|
|
|
print Dumper($lang, $words); |
|
|
|
|
|
|
|
unlink($tempfile); |
|
|
|
} |
|
|
|
print "\trunning OCR on page\n"; |
|
|
|
} |
|
|
|
else { |
|
|
|
print "\terror: don't know how to process files of $ext type"; |
|
|
|
} |
|
|
|
} |
|
|
|
else { |
|
|
|
print "\terror: file doesn't contain manadatory UUIDs in its name\n"; |
|
|
|
} |
|
|
|
} |
|
|
|
closedir(Q); |
|
|
|
|
|
|
|
print "done\n"; |
|
|
|
|
|
|
|
# open queue |
|
|
|
# decompose PDF |
|
|
|
# normalise all files as jpegs |
|
|
@@ -83,6 +118,18 @@ closedir(Q); |
|
|
|
# ocr / lang detect |
|
|
|
# update db |
|
|
|
|
|
|
|
# Run the tesseract OCR command on an image file and return its output.
#
# Parameters:
#   $file - path of the image file handed to tesseract.
#
# Returns:
#   Everything tesseract wrote to its standard output, as a single string.
#   Returns the empty string when the command cannot be started or when it
#   produced no output; failures are reported with warn() and never die, so
#   the caller's processing loop keeps running.
sub ocr_file {
    my ($file) = @_;
    my $txt = '';

    # The original line had an unbalanced ')' and was missing the sprintf()
    # that should have substituted $file for '%s', so it did not compile.
    # The list form of a pipe open passes $file as a single argument without
    # going through a shell, so no quoting of the path is needed.
    # The trailing '-' is the output base given to tesseract, kept exactly
    # as in the original command line (expected to mean "write to stdout").
    my $ocr;
    if ( !open( $ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-' ) ) {
        warn "ocr_file: cannot run tesseract on $file: $!\n";
        return $txt;
    }

    while ( my $line = <$ocr> ) {
        $txt .= $line;
    }

    # close() on a pipe waits for the child and leaves its status in $?.
    close($ocr)
        or warn "ocr_file: tesseract did not finish cleanly for $file (status $?)\n";

    return $txt;
}
|
|
|
sub create_page_words { |
|
|
|
my($pageid, $lang, $words) = @_; |
|
|
|
|