| $Data::Dumper::Sortkeys = 1; | $Data::Dumper::Sortkeys = 1; | ||||
| print "Loading configuration\n"; | |||||
| my $conf = load_conf("../etc/autodoc.json"); | my $conf = load_conf("../etc/autodoc.json"); | ||||
| print "Connecting to database\n"; | |||||
| my $dbh = sqlconnect($conf->{sql}); | my $dbh = sqlconnect($conf->{sql}); | ||||
| my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue}; | my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue}; | ||||
| my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original}; | my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original}; | ||||
| my %langid; | |||||
| my %primary; | my %primary; | ||||
| print "Loading languages\n"; | |||||
| my %langid; | |||||
| my $q = sqlquery($dbh, "SELECT id,short FROM lang"); | my $q = sqlquery($dbh, "SELECT id,short FROM lang"); | ||||
| while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; } | while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; } | ||||
| print "Opening queue folder $queuedir\n"; | |||||
| opendir(Q,$queuedir); | opendir(Q,$queuedir); | ||||
| foreach my $file ( readdir(Q) ) { | foreach my $file ( readdir(Q) ) { | ||||
| print "processing file $file\n"; | |||||
| if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { | if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { | ||||
| my $docid = $1; | my $docid = $1; | ||||
| my $ext = $3; | my $ext = $3; | ||||
| print "Found document id $docid of type $ext\n"; | |||||
| print "\tdocument id $docid of type $ext\n"; | |||||
| if ( $ext eq 'pdf' ) { | if ( $ext eq 'pdf' ) { | ||||
| my @pages; | my @pages; | ||||
| for(my $page=0;; $page++) { | for(my $page=0;; $page++) { | ||||
| my $txt = ''; | my $txt = ''; | ||||
| print "texting page $page\n"; | |||||
| open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last; | open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last; | ||||
| while(<TXT>) { | while(<TXT>) { | ||||
| chomp; | chomp; | ||||
| # end of PDF | # end of PDF | ||||
| last if $?; | last if $?; | ||||
| print "\t\textracted text from PDF for page $page\n"; | |||||
| my ($lang,$words) = detect_lang($txt); | my ($lang,$words) = detect_lang($txt); | ||||
| print "language is $lang\n"; | |||||
| #spell_check($txt,$lang); | |||||
| print "\t\tdetected language is $lang\n"; | |||||
| my $pageid = get_new_page($docid); | my $pageid = get_new_page($docid); | ||||
| print "new page id $pageid\n"; | |||||
| print "\t\tcreated page id $pageid\n"; | |||||
| print "\t\tupdating page status to 'inprogress'\n"; | |||||
| update_page_status($pageid, 'inprogress'); | update_page_status($pageid, 'inprogress'); | ||||
| if ( !exists $primary{$docid} ) { | if ( !exists $primary{$docid} ) { | ||||
| print "\t\tsetting document for default primary thumbnail\n"; | |||||
| $primary{$docid}=undef; | $primary{$docid}=undef; | ||||
| sqlquery($dbh, "CALL set_primary_page(?)",$pageid); | sqlquery($dbh, "CALL set_primary_page(?)",$pageid); | ||||
| } | } | ||||
| print "create original page jpeg $pageid.jpeg"; | |||||
| print "\t\tcreating original page jpeg $pageid.jpeg"; | |||||
| system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); | system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); | ||||
| system(sprintf("mv %s/%s.jpg %s/%s.jpeg", | system(sprintf("mv %s/%s.jpg %s/%s.jpeg", | ||||
| $originaldir, $pageid, | $originaldir, $pageid, | ||||
| $originaldir, $pageid )); | $originaldir, $pageid )); | ||||
| print "\t\tloading extracted words into database\n"; | |||||
| create_page_words($pageid, $lang, $words); | create_page_words($pageid, $lang, $words); | ||||
| print "\t\tupdating page status to 'ok'\n"; | |||||
| update_page_status($pageid, 'ok'); | update_page_status($pageid, 'ok'); | ||||
| } | } | ||||
| } | } | ||||
| elsif ( $ext =~ /^(jpeg|png)$/ ) { | |||||
| print "\tdetecting image rotation\n"; | |||||
| for(my $rot=0; $rot<=360; $rot+=90) { | |||||
| print "\t\ttrying $rot degrees rotation\n"; | |||||
| my $tempfile = "/tmp/autodoc.$$.jpeg"; | |||||
| system(sprintf("convert %s/%s -rotate %s %s", | |||||
| $queuedir, $file, $rot, $tempfile)); | |||||
| my($lang,$words) = detect_lang(ocr_file($tempfile)); | |||||
| print Dumper($lang, $words); | |||||
| unlink($tempfile); | |||||
| } | |||||
| print "\trunning OCR on page\n"; | |||||
| } | |||||
| else { | else { | ||||
| print "\terror: don't know how to process files of $ext type"; | |||||
| } | } | ||||
| } | } | ||||
| else { | |||||
| print "\terror: file doesn't contain manadatory UUIDs in its name\n"; | |||||
| } | |||||
| } | } | ||||
| closedir(Q); | closedir(Q); | ||||
| print "done\n"; | |||||
| # open queue | # open queue | ||||
| # decompose PDF | # decompose PDF | ||||
| # normalise all files as jpegs | # normalise all files as jpegs | ||||
| # ocr / lang detect | # ocr / lang detect | ||||
| # update db | # update db | ||||
# Run Tesseract OCR on an image file and return the recognised text.
#
# Parameters:
#   $file - path of the image to OCR.
# Returns:
#   Everything tesseract wrote to its standard output, or an empty string
#   when tesseract could not be started (a warning is emitted in that case).
#
# The command is started with list-form pipe open, so no shell is involved
# and file names containing spaces or shell metacharacters are passed to
# tesseract unchanged.
sub ocr_file {
    my ($file) = @_;

    # "-" as the output base makes tesseract print the text on stdout.
    my @cmd = ('tesseract', '-l', 'eng+deu+fra+ita', $file, '-');

    my $ocr;
    if ( !open($ocr, '-|', @cmd) ) {
        # Best effort: report the problem and hand back "no text" instead
        # of aborting the caller's processing loop.
        warn "cannot run tesseract on $file: $!\n";
        return '';
    }

    my $txt = '';
    while ( my $line = <$ocr> ) {
        $txt .= $line;
    }
    close($ocr);

    return $txt;
}
| sub create_page_words { | sub create_page_words { | ||||
| my($pageid, $lang, $words) = @_; | my($pageid, $lang, $words) = @_; | ||||