|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295 |
- #!/usr/bin/perl
-
- use strict;
- use JSON;
- use DBI;
- use utf8;
- use GD::Simple;
- use Data::Dumper;
- use Data::UUID;
- use File::Temp;
- use warnings;
-
# Script-wide setup: configuration, database handle, working directories
# and the language lookup table used by the subs further down.

# Dump hashes with sorted keys so debugging output is stable between runs.
$Data::Dumper::Sortkeys = 1;

print "Loading configuration\n";
my $conf = load_conf("../etc/autodoc.json");

print "Connecting to database\n";
my $dbh = sqlconnect($conf->{sql});

# Both working directories live below the configured global path.
my $queuedir    = "$conf->{path}{global}/$conf->{path}{queue}";
my $originaldir = "$conf->{path}{global}/$conf->{path}{original}";

# Document ids that already had a primary page assigned during this run.
my %primary;

print "Loading languages\n";

# Maps a language short code (lang.short) to its numeric id (lang.id).
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while ( my @row = $q->fetchrow_array() ) {
    my ($id, $short) = @row;
    $langid{$short} = $id;
}
-
# Queue ingestion: process every entry found in $queuedir.
#
# A queue entry must be named "<uuid>_<uuid>.<ext>". The first UUID is used
# as the document id; the second one is matched but not used here.
#   * pdf       - handled page by page: text is extracted with pdftotext,
#                 the page is rendered to a 300 dpi JPEG with pdftoppm.
#   * jpeg/png  - OCR is attempted at 0/90/180/270 degrees and the rotation
#                 with the most dictionary hits supplies the page words.
# Every entry is deleted from the queue afterwards, including entries that
# were rejected because of their name or type.
print "Opening queue folder $queuedir\n";
opendir(my $queue, $queuedir)
    or die "Failed to open queue folder $queuedir: $!";
foreach my $file ( readdir($queue) ) {
    next if $file =~ /^\./;
    print "processing file $file\n";
    if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
        my $docid = $1;
        my $ext   = $3;

        print "\tdocument id $docid of type $ext\n";

        if ( $ext eq 'pdf' ) {
            for ( my $page = 0; ; $page++ ) {
                # The poppler tools count pages from 1, the log output from 0.
                my $pageno = $page + 1;
                my $txt    = '';

                # This one stays a shell command because the shell provides
                # the 2>/dev/null redirection; $file is constrained to hex
                # digits, dashes and lower-case letters by the match above.
                my $cmd = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
                    $pageno, $pageno, $queuedir, $file);
                open(my $pdftext, '-|', $cmd) or last;
                while ( my $line = <$pdftext> ) {
                    chomp $line;
                    $txt .= ' ' . $line;
                }
                close($pdftext);

                # A non-zero exit status is taken as the end of the PDF.
                last if $?;

                print "\t\textracted text from PDF for page $page\n";

                my ($lang, $words) = detect_lang($txt);
                # detect_lang returns undef when no word was in the dictionary.
                print "\t\tdetected language is ",
                    ( defined $lang ? $lang : 'unknown' ), "\n";

                my $pageid = get_new_page($docid);
                print "\t\tcreated page id $pageid\n";
                print "\t\tupdating page status to 'inprogress'\n";
                update_page_status($pageid, 'inprogress');

                # The first page seen for a document becomes its primary page.
                if ( !exists $primary{$docid} ) {
                    print "\t\tsetting document for default primary thumbnail\n";
                    $primary{$docid} = undef;
                    sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
                }

                print "\t\tcreating original page jpeg $pageid.jpeg\n";
                # List-form system(): no shell, so configured paths containing
                # spaces or metacharacters are passed through unchanged.
                system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', '300',
                    '-jpeg', '-singlefile',
                    "$queuedir/$file", "$originaldir/$pageid") == 0
                    or warn "\t\twarning: pdftoppm failed for page $page of $file (status $?)\n";

                # The rendered file is expected as <pageid>.jpg; store it as .jpeg.
                rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
                    or warn "\t\twarning: could not rename $originaldir/$pageid.jpg: $!\n";

                print "\t\tloading extracted words into database\n";
                create_page_words($pageid, $lang, $words);

                print "\t\tupdating page status to 'ok'\n";
                update_page_status($pageid, 'ok');
                print "\tdone\n";
            }
        }
        elsif ( $ext =~ /^(jpeg|png)$/ ) {
            print "\tdetecting image rotation\n";
            my %res;
            for ( my $rot = 0; $rot < 360; $rot += 90 ) {
                print "\ttrying $rot degrees rotation\n";

                # Unpredictable temp file name; the file is removed when the
                # object goes out of scope at the end of this iteration.
                my $tempfile = File::Temp->new(
                    TEMPLATE => 'autodoc.XXXXXX',
                    TMPDIR   => 1,
                    SUFFIX   => '.jpeg',
                );
                system('convert', "$queuedir/$file", '-rotate', $rot,
                    $tempfile->filename) == 0
                    or warn "\t\twarning: convert failed for $file at $rot degrees (status $?)\n";

                print "\t\trunning OCR\n";
                my $txt = ocr_file($tempfile->filename);
                print "\t\tlanguage and dictionary detection\n";
                my ($lang, $words, $dictmatches) = detect_lang($txt);

                print "\t\tfound $dictmatches words in dictionary\n";

                $res{$rot} = {
                    lang        => $lang,
                    words       => $words,
                    dictmatches => $dictmatches,
                };
            }

            # Pick the rotation with the most dictionary hits. Rotations are
            # visited in ascending order so a tie goes to the smallest
            # rotation instead of depending on hash ordering.
            my $maxwords = 0;
            my $bestrot;
            foreach my $rot ( sort { $a <=> $b } keys %res ) {
                $bestrot = $rot if !defined $bestrot;
                if ( $maxwords < $res{$rot}{dictmatches} ) {
                    $maxwords = $res{$rot}{dictmatches};
                    $bestrot  = $rot;
                }
            }

            print "\tbest OCR results with $bestrot rotation\n";

            my $pageid = get_new_page($docid);
            print "\t\tcreated page id $pageid\n";

            print "\t\tupdating page status to 'inprogress'\n";
            update_page_status($pageid, 'inprogress');

            # The first page seen for a document becomes its primary page.
            if ( !exists $primary{$docid} ) {
                print "\t\tsetting document for default primary thumbnail\n";
                $primary{$docid} = undef;
                sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
            }

            print "\t\tcreating original page jpeg $pageid.jpeg\n";
            # The stored copy is converted from the queue file as uploaded;
            # the detected rotation is only used to choose the OCR result.
            system('convert', "$queuedir/$file", "$originaldir/$pageid.jpeg") == 0
                or warn "\t\twarning: convert failed for $file (status $?)\n";

            print "\t\tloading extracted words into database\n";
            create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
            print "\t\tupdating page status to 'ok'\n";
            update_page_status($pageid, 'ok');
            print "\tdone\n";
        }
        else {
            print "\terror: don't know how to process files of $ext type\n";
        }
    }
    else {
        print "\terror: file doesn't contain manadatory UUIDs in its name\n";
    }
    print "\tdeleting $file\n";
    unlink("$queuedir/$file");
}
closedir($queue);

print "done\n";
-
- # open queue
- # decompose PDF
- # normalise all files as jpegs
- # generate page
- # ocr / lang detect
- # update db
-
# Run tesseract on an image file and return the recognised text.
#
# Parameters:
#   $file - path of the image to OCR.
# Returns:
#   Everything tesseract wrote to standard output, as one string with the
#   line breaks kept. The empty string is returned when tesseract could not
#   be started or produced no output.
sub ocr_file {
    my ($file) = @_;
    my $txt = '';

    # List-form pipe open: the command is executed without a shell, so a
    # path containing spaces or shell metacharacters is passed verbatim.
    my $ocr;
    if ( !open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-') ) {
        warn "Failed to run tesseract on $file: $!\n";
        return $txt;
    }

    while ( my $line = <$ocr> ) {
        $txt .= $line;
    }
    close($ocr);

    return $txt;
}
# Store the words extracted from a page through the add_page_word procedure.
#
# Parameters:
#   $pageid - id of the page the words belong to.
#   $lang   - language short code used as key into %langid; may be undef
#             when no language could be detected.
#   $words  - array reference holding the words to store.
# Returns nothing.
sub create_page_words {
    my ($pageid, $lang, $words) = @_;

    # Resolve the language id once instead of per word. An undetected
    # language yields undef without indexing %langid with an undef key.
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{$words} ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)",
            $pageid, $word, $lang_id);
    }

    return;
}
-
# Feed a text through "aspell pipe" and print aspell's output to STDOUT.
#
# Parameters:
#   $txt  - text to check.
#   $lang - language passed to aspell's --lang option. It is interpolated
#           into a shell command, so it must come from a trusted source.
# Returns:
#   The result of closing the aspell pipe, or nothing when the pipe could
#   not be opened.
sub spell_check {
    my ($txt, $lang) = @_;

    my $tmp = File::Temp->new();
    print {$tmp} $txt;
    # The text is still in Perl's output buffer at this point; flush it so
    # the external command really sees it in the file.
    $tmp->flush();

    my $cmd = sprintf("cat %s | aspell --lang=%s --ignore-case pipe",
        $tmp->filename, $lang);
    my $aspell;
    if ( !open($aspell, '-|', $cmd) ) {
        warn "Failed to run aspell: $!\n";
        return;
    }

    while ( my $line = <$aspell> ) {
        print $line;
    }

    return close($aspell);
}
-
-
# Split a text into words, look each word up in the dict table and guess
# the language of the text from the dictionary hits.
#
# Parameters:
#   $txt - text to analyse. Expected to be UTF-8 encoded bytes, as read
#          from the pdftotext / tesseract pipes elsewhere in this script.
# Returns a three-element list:
#   1. the dict.lang value with the most hits, or undef when no word was
#      found in the dictionary (ties go to the value that sorts first);
#   2. an array reference with every lower-cased word of at least three
#      characters, in order of appearance, duplicates included;
#   3. the number of word occurrences that were found in the dictionary.
sub detect_lang {
    my ($txt) = @_;

    my %lcnt;
    my @words;
    my $dictwords = 0;

    # Dictionary answers already fetched during this call, keyed by word,
    # so a repeated word does not cost another database round trip.
    my %langs_for;

    # Work on characters rather than bytes: an accented letter is two bytes
    # in UTF-8 and would otherwise be treated as a word separator. If the
    # input is not valid UTF-8, utf8::decode leaves it untouched.
    my $text = defined $txt ? $txt : '';
    utf8::decode($text);

    # Words are runs of ASCII letters plus the characters U+00C0..U+00FF,
    # i.e. the characters whose UTF-8 encoding is C3 80 .. C3 BF.
    foreach my $word ( split(/[^a-zA-Z\x{c0}-\x{ff}]/, $text) ) {
        next if length($word) < 3;
        $word = lc($word);
        push @words, $word;

        if ( !exists $langs_for{$word} ) {
            my @langs;
            my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
            while ( my ($l) = $q->fetchrow_array() ) {
                push @langs, $l;
            }
            $langs_for{$word} = \@langs;
        }

        my $langs = $langs_for{$word};
        next if !@{$langs};

        # Every occurrence counts, exactly as if it had been looked up again.
        $lcnt{$_}++ for @{$langs};
        $dictwords++;
    }

    # Sorted iteration keeps the winner stable when two languages tie.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max  = $lcnt{$lang};
        }
    }

    return ($lmax, \@words, $dictwords);
}
-
# Set the status of a page by calling the update_page_status procedure.
#
# Parameters:
#   $pageid - id of the page to update.
#   $status - new status value (this script passes 'inprogress' and 'ok').
# Returns the statement handle produced by sqlquery.
sub update_page_status {
    my ($pageid, $status) = @_;

    my @bind = ($pageid, $status);
    return sqlquery($dbh, "CALL update_page_status(?,?)", @bind);
}
-
# Create a new page for a document through the create_page procedure.
#
# Parameters:
#   $docid - id of the document the page belongs to.
# Returns:
#   The first column of the last row the procedure returned, or undef when
#   it returned no rows.
sub get_new_page {
    my ($docid) = @_;

    my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);

    my $pageid;
    while ( my @row = $sth->fetchrow_array() ) {
        $pageid = $row[0];
    }

    return $pageid;
}
-
# Generate a new UUID with Data::UUID and return its string form in lower case.
sub gen_uuid {
    my $generator = Data::UUID->new;
    my $uuid      = $generator->create_str();

    return lc $uuid;
}
-
# Read a JSON configuration file and return the decoded data structure.
#
# Parameters:
#   $file - path of the JSON file.
# Returns:
#   Whatever from_json produces for the file content.
# Dies when the file cannot be opened (the message names the file and the
# system error) or when from_json rejects the content.
sub load_conf {
    my ($file) = @_;

    # Three-argument open: the path is never interpreted as an open mode.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";

    # Slurp the whole file at once.
    my $json = do { local $/; <$fh> };
    close($fh);

    return from_json($json);
}
-
# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys base (database name), host, user
#          and pass.
# Returns the database handle; the connection is opened with
# mysql_enable_utf8 switched on.
# Dies with the DBI error text when the connection cannot be established.
sub sqlconnect {
    my ($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}, { mysql_enable_utf8 => 1 });
    if ( !$dbh ) {
        my $err = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
        die "Failed to connect to database: $err";
    }

    return $dbh;
}
-
# Prepare and execute an SQL statement.
#
# Parameters:
#   $dbh   - database handle to run the statement on.
#   $query - SQL text, with ? placeholders.
#   @args  - values bound to the placeholders, in order.
# Returns the executed statement handle.
# Dies when preparing or executing fails; the message says which step
# failed and includes the driver's error text.
sub sqlquery {
    my ($dbh, $query, @args) = @_;

    #print STDERR "$query\n";

    my $sth = $dbh->prepare($query);
    if ( !$sth ) {
        my $err = $dbh->errstr;
        die "Failed to prepare SQL query: "
            . ( defined $err ? $err : 'unknown error' );
    }

    if ( !$sth->execute(@args) ) {
        my $err = $sth->errstr;
        die "Failed to execute SQL query: "
            . ( defined $err ? $err : 'unknown error' );
    }

    return $sth;
}
|