public
/
autodoc

#!/usr/bin/perl

use strict;
use lib '/opt/autodoc/lib';
use Autodoc;
use JSON;
use DBI;
use utf8;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use File::Temp;
use warnings;

# Make any Data::Dumper debugging output list hash keys in sorted order.
$Data::Dumper::Sortkeys = 1;

print "Loading configuration\n";
my $conf = load_conf("../etc/autodoc.json");

print "Connecting to database\n";
my $dbh = sqlconnect($conf->{sql});

# Both working folders live below the configured global path:
# the queue holds incoming files, the other receives the page JPEGs.
my ($queuedir, $originaldir) =
	map { join('/', $conf->{path}{global}, $conf->{path}{$_}) } qw(queue original);

# Documents for which a primary page has already been set during this run.
my %primary;

print "Loading languages\n";
# Maps a language short code to its id in the languages table.
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM languages");
while ( my @row = $q->fetchrow_array() ) {
	my ($id, $short) = @row;
	$langid{$short} = $id;
}

# Walk the queue folder and turn every queued file into page records.
#
# File names must look like <document uuid>_<uuid>.<ext>.  PDFs are split
# into one page per PDF page (text taken from pdftotext); JPEG/PNG images
# become a single page whose text comes from OCR at the rotation that
# matches the most dictionary words.  Every non-hidden entry is removed
# from the queue afterwards, whether or not it could be processed.
print "Opening queue folder $queuedir\n";
# An unreadable queue folder used to be skipped silently; stop instead.
opendir(my $queue, $queuedir) or die "cannot open queue folder $queuedir: $!\n";
my @queued = readdir($queue);
closedir($queue);

foreach my $file ( @queued ) {
	next if $file =~ /^\./;
	print "processing file $file\n";
	if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
		my $docid = $1;
		my $ext = $3;
		my $source = "$queuedir/$file";

		print "\tdocument id $docid of type $ext\n";

		if ( $ext eq 'pdf' ) {
			# Pages are probed one by one; pdftotext exiting with an error
			# status is taken as "past the last page".
			for(my $page=0;; $page++) {
				my $pageno = $page + 1;
				my $txt = '';
				# List-form pipe open: no shell is involved, so the path needs
				# no quoting.  -q keeps pdftotext from writing messages to
				# stderr (previously done with 2>/dev/null).
				open(my $pdftext, '-|', 'pdftotext', '-q',
					'-f', $pageno, '-l', $pageno, $source, '-') or last;
				while ( my $line = <$pdftext> ) {
					chomp $line;
					$txt .= ' ' . $line;
				}
				close($pdftext);

				# end of PDF
				last if $?;

				print "\t\textracted text from PDF for page $page\n";

				my ($lang,$words) = detect_lang($txt);
				# $lang is undef when no word was found in the dictionary
				print "\t\tdetected language is ", (defined $lang ? $lang : 'unknown'), "\n";

				my $pageid = _queue_begin_page($docid);

				print "\t\tcreating original page jpeg $pageid.jpeg\n";
				# pdftoppm -singlefile writes <prefix>.jpg; the page image is
				# expected under the .jpeg extension.
				if ( _queue_run_tool('pdftoppm', '-f', $pageno, '-l', $pageno,
						'-r', 300, '-jpeg', '-singlefile',
						$source, "$originaldir/$pageid") ) {
					rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
						or warn "\t\tcannot rename $originaldir/$pageid.jpg: $!\n";
				}

				print "\t\tloading extracted words into database\n";
				create_page_words($pageid, $lang, $words);

				print "\t\tupdating page status to 'ok'\n";
				update_page_status($pageid, 'ok');
				print "\tdone\n";
			}
		}
		elsif ( $ext =~ /^(jpeg|png)$/ ) {
			print "\tdetecting image rotation\n";
			my %res;
			my @rotations = (0, 90, 180, 270);
			foreach my $rot ( @rotations ) {
				print "\ttrying $rot degrees rotation\n";
				# Unpredictable scratch file, removed automatically when $tmp
				# goes out of scope at the end of this iteration.
				my $tmp = File::Temp->new(
					TEMPLATE => 'autodoc.XXXXXX',
					TMPDIR   => 1,
					SUFFIX   => '.jpeg',
				);
				my $tempfile = $tmp->filename;

				my $txt = '';
				if ( _queue_run_tool('convert', $source, '-rotate', $rot, $tempfile) ) {
					print "\t\trunning OCR\n";
					$txt = ocr_file($tempfile);
				}
				print "\t\tlanguage and dictionary detection\n";
				my($lang,$words, $dictmatches) = detect_lang($txt);

				print "\t\tfound $dictmatches words in dictionary\n";

				$res{$rot} = {
					lang => $lang,
					words => $words,
					dictmatches => $dictmatches
				};
			}

			# Pick the rotation with the most dictionary matches; walking the
			# rotations in a fixed order makes ties resolve to the smallest
			# angle instead of depending on hash key order.
			my $bestrot = $rotations[0];
			foreach my $rot ( @rotations ) {
				$bestrot = $rot
					if $res{$rot}{dictmatches} > $res{$bestrot}{dictmatches};
			}

			print "\tbest OCR results with $bestrot rotation\n";

			my $pageid = _queue_begin_page($docid);

			print "\t\tcreating original page jpeg $pageid.jpeg\n";
			# The stored image is the unrotated original; only the words and
			# language of the best rotation are kept.
			_queue_run_tool('convert', $source, "$originaldir/$pageid.jpeg");

			print "\t\tloading extracted words into database\n";
			create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
			print "\t\tupdating page status to 'ok'\n";
			update_page_status($pageid, 'ok');
			print "\tdone\n";
		}
		else {
			print "\terror: don't know how to process files of $ext type\n";
		}
	}
	else {
		print "\terror: file doesn't contain manadatory UUIDs in its name\n";
	}
	print "\tdeleting $file\n";
	unlink("$queuedir/$file") or warn "\tcannot delete $queuedir/$file: $!\n";
}

print "done\n";

# Create a page for $docid, mark it 'inprogress' and, for the first page
# created for that document during this run, make it the primary page.
# Returns the id of the new page.
sub _queue_begin_page {
	my ($docid) = @_;

	my $pageid = get_new_page($docid);
	print "\t\tcreated page id $pageid\n";

	print "\t\tupdating page status to 'inprogress'\n";
	update_page_status($pageid, 'inprogress');

	if ( !exists $primary{$docid} ) {
		print "\t\tsetting document for default primary thumbnail\n";
		$primary{$docid}=undef;
		sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
	}

	return $pageid;
}

# Run an external program without going through a shell.
# Returns 1 when it exited with status 0; otherwise warns and returns 0.
sub _queue_run_tool {
	my (@cmd) = @_;

	return 1 if system(@cmd) == 0;

	warn "\t\tcommand failed (status $?): @cmd\n";
	return 0;
}

# open queue
	# decompose PDF
	# normalise all files as jpegs
	# generate page
	# ocr / lang detect
	# update db

# Run tesseract on an image and return the recognised text.
#
# Parameters:
#   $file - path of the image to OCR.
#
# Returns everything tesseract wrote to standard output as one string
# (line breaks preserved), or '' when tesseract could not be started.
sub ocr_file {
	my($file) = @_;
	my $txt = '';

	# alternative language set: -l eng+deu+fra+ita
	# List-form pipe open hands $file to tesseract as a single argument,
	# so no shell ever gets to interpret the path.
	my $ocr;
	if ( !open($ocr, '-|', 'tesseract', '-l', 'Latin', $file, '-') ) {
		warn "cannot run tesseract on $file: $!\n";
		return $txt;
	}
	while ( my $line = <$ocr> ) {
		$txt .= $line;
	}
	# close() on a pipe reports the child's exit status through $?
	close($ocr) or warn "tesseract failed on $file (status $?)\n";

	return $txt;
}
# Store the words of a page through the create_page_word stored procedure.
#
# Parameters:
#   $pageid - id of the page the words belong to.
#   $lang   - language short code used to look up the language id in
#             %langid; may be undef when no language was detected.
#   $words  - array reference with the words to store.
#
# Returns nothing.
sub create_page_words {
	my($pageid, $lang, $words) = @_;

	# Resolve the language id once.  An undetected language is passed on as
	# undef, as before, but no longer triggers an "uninitialized value"
	# warning for every single word.
	my $lang_id = defined $lang ? $langid{$lang} : undef;

	foreach my $word ( @{ $words || [] } ) {
		sqlquery($dbh, "CALL create_page_word(?,?,?)",
			$pageid, $word, $lang_id);
	}

	return;
}

# Feed a text to "aspell pipe" and copy aspell's output to standard output.
#
# Parameters:
#   $txt  - text to check.
#   $lang - language passed to aspell's --lang option.
#
# Returns the result of closing the aspell pipe (true when aspell exited
# with status 0), or nothing when aspell could not be started.
sub spell_check {
	my($txt, $lang) = @_;

	my $tmp = File::Temp->new();
	print $tmp $txt;
	# The text is still sitting in Perl's output buffer at this point; push
	# it to disk so the external process actually finds it in the file.
	$tmp->flush();

	# $lang and the file name travel as positional parameters of sh, so
	# neither is ever parsed as shell syntax.
	my $aspell;
	if ( !open($aspell, '-|', 'sh', '-c',
			'exec aspell --lang="$1" --ignore-case pipe < "$2"',
			'sh', $lang, $tmp->filename) ) {
		warn "cannot run aspell: $!\n";
		return;
	}
	while ( my $line = <$aspell> ) {
		print $line;
	}
	return close($aspell);
}


# Split a text into words and guess its language from dictionary hits.
#
# Parameters:
#   $txt - text to analyse, as read from pdftotext or tesseract.  It is
#          decoded as UTF-8 when it is valid UTF-8.
#
# Returns a three-element list:
#   - the language with the most matching dictionary rows (undef when no
#     word matched; ties go to the alphabetically first language),
#   - an array reference with every lower-cased word of at least three
#     characters, in order of appearance,
#   - the number of words that matched the dictionary at least once.
sub detect_lang {
	my($txt) = @_;

	my %lcnt;
	my @words;
	my $dictwords=0;

	$txt = '' if !defined $txt;

	# Work on characters, not bytes.  utf8::decode converts in place and
	# returns false, leaving the text alone, when it is not valid UTF-8.
	my $is_utf8 = utf8::decode($txt);

	# Letters are a-z, A-Z and U+00C0..U+00FF.  The earlier class
	# \x{c380}-\x{c3bf} named code points, not the UTF-8 byte pairs
	# C3 80..C3 BF, so accented letters never matched and cut words apart.
	foreach my $word ( split(/[^a-zA-Z\x{c0}-\x{ff}]/,$txt) ) {
		next if length $word < 3;
		$word = lc($word);
		# Hand the word on as UTF-8 bytes, the form the input arrived in.
		utf8::encode($word) if $is_utf8;
		push @words, $word;

		my $found=0;
		my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
		while(my ($l) = $q->fetchrow_array()) {
			$lcnt{$l}++;
			$found=1;
		}
		$dictwords++ if $found;
	}

	#print Dumper(\%lcnt);

	# Sorted iteration keeps the winner stable when two languages tie.
	my $max = 0;
	my $lmax;
	foreach my $lang ( sort keys %lcnt ) {
		if ( $lcnt{$lang} > $max ) {
			$lmax = $lang;
			$max = $lcnt{$lang};
		}
	}

	return ($lmax, \@words, $dictwords);
}

# Set the status of a page through the set_page_status stored procedure.
#
# Parameters:
#   $pageid - id of the page to update.
#   $status - new status value (this script uses 'inprogress' and 'ok').
#
# Returns whatever sqlquery returns for the call.
sub update_page_status {
	my $pageid = shift;
	my $status = shift;

	return sqlquery($dbh, "CALL set_page_status(?,?)", $pageid, $status);
}

# Create a new page for a document through the create_page stored procedure.
#
# Parameters:
#   $docid - id of the document the page belongs to.
#
# Returns the first column of the last row the call produced, or undef
# when it produced no rows.
sub get_new_page {
	my $docid = shift;

	my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);

	# Drain the whole result set, remembering the most recent id seen.
	my $last_id;
	while ( my @row = $sth->fetchrow_array() ) {
		$last_id = $row[0];
	}

	return $last_id;
}

# Generate a new UUID with Data::UUID and return its string form in
# lower case.
sub gen_uuid {
	my $uuid = Data::UUID->new->create_str();

	return lc $uuid;
}