5 年前 · badf8640d3
--- a/bin/autodoc_process.pl
+++ b/bin/autodoc_process.pl
@@ -11,30 +11,36 @@ use warnings;

 # Sort hash keys in Data::Dumper output so debug dumps are stable between runs.
 $Data::Dumper::Sortkeys = 1;

 print "Loading configuration\n";
 my $conf = load_conf("../etc/autodoc.json");

 print "Connecting to database\n";
 my $dbh  = sqlconnect($conf->{sql});

 # Working directories, both located below the configured global path.
 my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
 my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

 # %langid maps a language short code to its id in the lang table.
 # %primary remembers the documents for which set_primary_page was already called.
 # Each is declared exactly once: a second "my" in the same scope would mask
 # the first and trigger a warning.
 my %langid;
 my %primary;

 print "Loading languages\n";
 my $q = sqlquery($dbh, "SELECT id,short FROM lang");
 while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }

 print "Opening queue folder $queuedir\n";
 # Stop here when the queue folder is unreadable; otherwise readdir would
 # quietly return nothing and the run would look like an empty queue.
 opendir(Q,$queuedir) or die "cannot open queue folder $queuedir: $!\n";
 foreach my $file ( readdir(Q) ) {
 	print "processing file $file\n";
 	if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
 		my $docid = $1;
 		my $ext = $3;

 		print "Found document id $docid of type $ext\n";
 		print "\tdocument id $docid of type $ext\n";

 		if ( $ext eq 'pdf' ) {
 			my @pages;
 			for(my $page=0;; $page++) {
 				my $txt = '';
 				print "texting page $page\n";
 				open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
 				while(<TXT>) {
 					chomp;
@@ -45,37 +51,66 @@ foreach my $file ( readdir(Q) ) {
 				# end of PDF
 				last if $?;

 				print "\t\textracted text from PDF for page $page\n";


 				my ($lang,$words) = detect_lang($txt);
 				print "language is $lang\n";
 				#spell_check($txt,$lang);
 				print "\t\tdetected language is $lang\n";

 				my $pageid = get_new_page($docid);
 				print "new page id $pageid\n";
 				print "\t\tcreated page id $pageid\n";
 				print "\t\tupdating page status to 'inprogress'\n";
 				update_page_status($pageid, 'inprogress');

 				if ( !exists $primary{$docid} ) {
 					print "\t\tsetting document for default primary thumbnail\n";
 					$primary{$docid}=undef;
 					sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
 				}


 				print "create original page jpeg $pageid.jpeg";
 				print "\t\tcreating original page jpeg $pageid.jpeg";
 				system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
 				system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
 					$originaldir, $pageid,
 					$originaldir, $pageid ));

 				print "\t\tloading extracted words into database\n";
 				create_page_words($pageid, $lang, $words);

 				print "\t\tupdating page status to 'ok'\n";
 				update_page_status($pageid, 'ok');

 			}
 		}
 		elsif ( $ext =~ /^(jpeg|png)$/ ) {
 			# OCR the image at each quarter turn and dump what language
 			# detection returns for that orientation.
 			print "\tdetecting image rotation\n";
 			# Stop before 360: a full turn yields the same picture as 0 degrees.
 			for(my $rot=0; $rot<360; $rot+=90) {
 				print "\t\ttrying $rot degrees rotation\n";
 				my $tempfile = "/tmp/autodoc.$$.jpeg";

 				# List-form system() bypasses the shell, so the paths reach
 				# convert unchanged even if they contain spaces.
 				if ( system('convert', "$queuedir/$file", '-rotate', $rot, $tempfile) != 0 ) {
 					print "\t\terror: could not rotate $file by $rot degrees\n";
 					unlink($tempfile);
 					next;
 				}

 				my($lang,$words) = detect_lang(ocr_file($tempfile));

 				print Dumper($lang, $words);

 				unlink($tempfile);
 			}
 			print "\trunning OCR on page\n";
 		}
 		else {
 			# Unsupported extension: report it and do nothing else with the file.
 			# The trailing newline keeps this message on its own line.
 			print "\terror: don't know how to process files of $ext type\n";
 		}
 	}
 	else {
 		print "\terror: file doesn't contain manadatory UUIDs in its name\n";
 	}
 }
 closedir(Q);

 print "done\n";

 # open queue
 	# decompose PDF
 	# normalise all files as jpegs
@@ -83,6 +118,18 @@ closedir(Q);
 	# ocr / lang detect
 	# update db

 # ocr_file($file)
 #
 # Runs tesseract on the image at $file with the eng+deu+fra+ita language
 # set and returns everything it writes to standard output as one string.
 # Returns an empty string when tesseract cannot be started or prints nothing.
 sub ocr_file {
 	my($file) = @_;
 	my $txt = '';

 	# List-form pipe open: the command is executed without a shell, so $file
 	# is handed to tesseract as a single argument whatever characters it holds.
 	# The trailing '-' asks tesseract to write the text to stdout.
 	open(my $ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-')
 		or do {
 			warn "cannot run tesseract on $file: $!\n";
 			return $txt;
 		};

 	while(my $line = <$ocr>) {
 		$txt .= $line;
 	}
 	close($ocr);

 	return $txt;
 }
 sub create_page_words {
 	my($pageid, $lang, $words) = @_;