#!/usr/bin/perl
#
# Queue processor: walks the queue folder and, for every file named
# <docid-uuid>_<uuid>.<ext>, creates page records in the database.
#
# Steps:
#   - open queue
#   - decompose PDF
#   - normalise all files as jpegs
#   - generate page
#   - ocr / lang detect
#   - update db
#
# External programs invoked: pdftotext, pdftoppm, convert, tesseract
# (and aspell from spell_check, which is not called by the main loop).

use strict;
use warnings;

use JSON;
use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use File::Temp;

$Data::Dumper::Sortkeys = 1;

print "Loading configuration\n";
my $conf = load_conf("../etc/autodoc.json");

print "Connecting to database\n";
my $dbh = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global} . '/' . $conf->{path}{queue};
my $originaldir = $conf->{path}{global} . '/' . $conf->{path}{original};

# Documents for which set_primary_page has already been called in this run.
my %primary;

print "Loading languages\n";
my %langid;    # language short code => numeric id from the lang table
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while (my ($id, $short) = $q->fetchrow_array()) {
    $langid{$short} = $id;
}

print "Opening queue folder $queuedir\n";
opendir(my $queue, $queuedir)
    or die "Failed to open queue folder $queuedir: $!";
# Sorted so that the processing order (and therefore which page becomes the
# primary one for a document) does not depend on directory order.
my @files = sort grep { !/^\./ } readdir($queue);
closedir($queue);

foreach my $file (@files) {
    print "processing file $file\n";

    if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
        my $docid = $1;
        my $ext   = $3;
        print "\tdocument id $docid of type $ext\n";

        if ( $ext eq 'pdf' ) {
            process_pdf($docid, $file);
        }
        elsif ( $ext =~ /^(jpeg|png)$/ ) {
            process_image($docid, $file);
        }
        else {
            print "\terror: don't know how to process files of $ext type\n";
        }
    }
    else {
        print "\terror: file doesn't contain manadatory UUIDs in its name\n";
    }
}

print "done\n";

# Process a queued PDF: one page record per PDF page, with the page text
# taken from pdftotext and the page image rendered by pdftoppm.
#   $docid - document UUID taken from the file name
#   $file  - file name inside the queue folder
sub process_pdf {
    my ($docid, $file) = @_;
    my $path = "$queuedir/$file";

    for (my $page = 0; ; $page++) {
        my $pageno = $page + 1;    # pdftotext/pdftoppm pages are 1-based

        my $txt = pdf_page_text($path, $pageno);
        last if !defined $txt;     # end of PDF

        print "\t\textracted text from PDF for page $page\n";
        my ($lang, $words) = detect_lang($txt);
        print "\t\tdetected language is "
            . (defined $lang ? $lang : 'unknown') . "\n";

        my $pageid = register_page($docid);

        print "\t\tcreating original page jpeg $pageid.jpeg\n";
        my $root = "$originaldir/$pageid";
        # List-form system(): arguments are never interpreted by a shell.
        system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', 300,
               '-jpeg', '-singlefile', $path, $root) == 0
            or warn "\t\twarning: pdftoppm failed for page $pageno of $file\n";
        # pdftoppm names its output <root>.jpg; stored originals use .jpeg.
        rename("$root.jpg", "$root.jpeg")
            or warn "\t\twarning: cannot rename $root.jpg to $root.jpeg: $!\n";

        print "\t\tloading extracted words into database\n";
        create_page_words($pageid, $lang, $words);

        # NOTE(review): the page is marked 'ok' even when the commands above
        # failed; no failure status value is visible in this file - confirm.
        print "\t\tupdating page status to 'ok'\n";
        update_page_status($pageid, 'ok');
        print "\tdone\n";
    }
    return;
}

# Process a queued image: OCR it at 0/90/180/270 degrees, keep the result
# with the most dictionary matches, and store a single page for it.
#   $docid - document UUID taken from the file name
#   $file  - file name inside the queue folder
sub process_image {
    my ($docid, $file) = @_;
    my $path = "$queuedir/$file";

    print "\tdetecting image rotation\n";
    my %res;
    for (my $rot = 0; $rot < 360; $rot += 90) {
        print "\ttrying $rot degrees rotation\n";
        # Unpredictable temp file name; removed when $tempfile goes out of scope.
        my $tempfile = File::Temp->new(SUFFIX => '.jpeg');
        my $tempname = $tempfile->filename;

        system('convert', $path, '-rotate', $rot, $tempname) == 0
            or warn "\t\twarning: convert failed rotating $file by $rot\n";

        print "\t\trunning OCR\n";
        my $txt = ocr_file($tempname);

        print "\t\tlanguage and dictionary detection\n";
        my ($lang, $words, $dictmatches) = detect_lang($txt);
        print "\t\tfound $dictmatches words in dictionary\n";

        $res{$rot} = { lang => $lang, words => $words, dictmatches => $dictmatches };
    }

    # Numeric order makes ties deterministic: the smallest rotation wins,
    # and 0 is used when no rotation produced any dictionary match.
    my $maxwords = 0;
    my $bestrot;
    foreach my $rot ( sort { $a <=> $b } keys %res ) {
        $bestrot = $rot if !defined $bestrot;
        if ( $maxwords < $res{$rot}{dictmatches} ) {
            $maxwords = $res{$rot}{dictmatches};
            $bestrot  = $rot;
        }
    }
    print "\tbest OCR results with $bestrot rotation\n";

    my $pageid = register_page($docid);

    # NOTE(review): the stored original is converted without applying
    # $bestrot; only the OCR words come from the best rotation - confirm
    # that this is intended.
    print "\t\tcreating original page jpeg $pageid.jpeg\n";
    system('convert', $path, "$originaldir/$pageid.jpeg") == 0
        or warn "\t\twarning: convert failed for $file\n";

    print "\t\tloading extracted words into database\n";
    create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});

    print "\t\tupdating page status to 'ok'\n";
    update_page_status($pageid, 'ok');
    print "\tdone\n";
    return;
}

# Create a page for $docid, mark it 'inprogress' and, the first time a
# document is seen in this run, call set_primary_page for it.
# Returns the new page id.
sub register_page {
    my ($docid) = @_;

    my $pageid = get_new_page($docid);
    print "\t\tcreated page id $pageid\n";

    print "\t\tupdating page status to 'inprogress'\n";
    update_page_status($pageid, 'inprogress');

    if ( !exists $primary{$docid} ) {
        print "\t\tsetting document for default primary thumbnail\n";
        $primary{$docid} = undef;
        sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
    }
    return $pageid;
}

# Run pdftotext on a single page of $path and return its text with the
# lines joined by single spaces. Returns undef when pdftotext cannot be
# started or exits non-zero, which the caller treats as the end of the PDF.
sub pdf_page_text {
    my ($path, $pageno) = @_;

    # pdftotext's stderr is discarded (the original command used
    # 2>/dev/null). The child inherits STDERR at fork time, so it is
    # pointed at /dev/null only around the pipe open and then restored.
    my $saved_err;
    my $have_saved = open($saved_err, '>&', \*STDERR);
    open(STDERR, '>', '/dev/null') if $have_saved;
    my $started = open(my $pipe, '-|',
                       'pdftotext', '-f', $pageno, '-l', $pageno, $path, '-');
    open(STDERR, '>&', $saved_err) if $have_saved;
    return if !$started;

    my $txt = '';
    while (my $line = <$pipe>) {
        chomp $line;
        $txt .= ' ' . $line;
    }
    close($pipe);
    return if $?;    # non-zero exit status
    return $txt;
}

# Run tesseract (eng+deu+fra+ita) on $file and return the text it prints
# on stdout; returns '' when tesseract cannot be started.
sub ocr_file {
    my ($file) = @_;
    my $txt = '';

    my $started = open(my $ocr, '-|',
                       'tesseract', '-l', 'eng+deu+fra+ita', $file, '-');
    if ( !$started ) {
        warn "Failed to run tesseract on $file: $!\n";
        return $txt;
    }
    while (my $line = <$ocr>) {
        $txt .= $line;
    }
    close($ocr);
    return $txt;
}

# Store every word of @$words for page $pageid via add_page_word.
# $lang is a language short code, or undef when none was detected; the
# language id passed to the procedure is then undef (SQL NULL).
sub create_page_words {
    my ($pageid, $lang, $words) = @_;
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{$words} ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)", $pageid, $word, $lang_id);
    }
    return;
}

# Feed $txt to "aspell pipe" for language $lang and print aspell's output.
# Not called by the main loop.
sub spell_check {
    my ($txt, $lang) = @_;

    # $lang is interpolated into a shell command line; refuse anything
    # that is not a plain language code.
    if ( !defined $lang || $lang !~ /\A[A-Za-z_]+\z/ ) {
        warn "spell_check: refusing invalid language code\n";
        return;
    }

    my $tmp = File::Temp->new();
    print $tmp $txt;
    # Flush so that the text is on disk before cat reads the file.
    $tmp->flush();

    my $started = open(my $aspell, '-|',
        sprintf("cat %s | aspell --lang=%s --ignore-case pipe",
                $tmp->filename, $lang));
    if ( !$started ) {
        warn "spell_check: failed to run aspell: $!\n";
        return;
    }
    while (my $line = <$aspell>) {
        print $line;
    }
    close($aspell);
    return;
}

# Split $txt into lowercased words of at least 3 bytes and look each one up
# in the dict table.
# Returns a list: (language with the most dictionary rows, or undef if no
# word matched; array ref of all words; number of words found in dict).
sub detect_lang {
    my ($txt) = @_;
    $txt = '' if !defined $txt;

    my %lcnt;
    my @words;
    my $dictwords = 0;

    # $txt is a byte string as read from the external tool. A word is a run
    # of ASCII letters and two-byte sequences C3 80..C3 BF. The previous
    # class used \x{c380}-\x{c3bf}, which names code points U+C380..U+C3BF
    # and can never match a byte string, so accented words were split.
    # NOTE(review): assumes the tools emit UTF-8, where C3 80..C3 BF
    # encodes U+00C0..U+00FF - confirm.
    foreach my $token ( $txt =~ /(?:[a-zA-Z]|\xc3[\x80-\xbf])+/g ) {
        next if length $token < 3;
        my $word = lc($token);
        push @words, $word;

        my $found = 0;
        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
        while (my ($l) = $q->fetchrow_array()) {
            $lcnt{$l}++;
            $found = 1;
        }
        $dictwords++ if $found;
    }

    # Sorted keys keep the winner stable when two languages tie.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        $lmax = $lang if !defined $lmax;
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max  = $lcnt{$lang};
        }
    }

    return ($lmax, \@words, $dictwords);
}

# Set the status of page $pageid through the update_page_status procedure.
sub update_page_status {
    my ($pageid, $status) = @_;
    sqlquery($dbh, "CALL update_page_status(?,?)", $pageid, $status);
    return;
}

# Call create_page for $docid and return the id from the last row it
# yields. Dies when the procedure returns no id.
sub get_new_page {
    my ($docid) = @_;
    my $pageid;

    my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
    while (my ($id) = $q->fetchrow_array()) {
        $pageid = $id;
    }
    die "Failed to create page for document $docid" if !defined $pageid;
    return $pageid;
}

# Return a new lowercase UUID string.
sub gen_uuid {
    my $ug = Data::UUID->new;
    return lc($ug->create_str());
}

# Read $file and return the data structure decoded from its JSON content.
# Dies if the file cannot be opened.
sub load_conf {
    my ($file) = @_;

    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = '';
    while (my $line = <$fh>) {
        $json .= $line;
    }
    close($fh);
    return from_json($json);
}

# Connect to the MySQL database described by the hash ref $sql
# (keys: base, host, user, pass). Returns the handle; dies on failure.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        || die "Failed to connect to database: "
             . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
    return $dbh;
}

# Prepare $query on $dbh and execute it with the remaining arguments as
# bind values. Returns the executed statement handle; dies on failure.
sub sqlquery {
    my $dbh   = shift;
    my $query = shift;
    my @args  = @_;

    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        || die "Failed to prepare SQL query: "
             . (defined $dbh->errstr ? $dbh->errstr : 'unknown error');
    $sth->execute(@args)
        || die "Failed to execute SQL query: "
             . (defined $sth->errstr ? $sth->errstr : 'unknown error');
    return $sth;
}