An unfinished system to manage all your paper documentation in an easy way.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

autodoc_process.pl 6.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use utf8;
  6. use GD::Simple;
  7. use Data::Dumper;
  8. use Data::UUID;
  9. use File::Temp;
  10. use warnings;
  11. $Data::Dumper::Sortkeys = 1;
  12. print "Loading configuration\n";
  13. my $conf = load_conf("../etc/autodoc.json");
  14. print "Connecting to database\n";
  15. my $dbh = sqlconnect($conf->{sql});
  16. my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  17. my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};
  18. my %primary;
  19. print "Loading languages\n";
  20. my %langid;
  21. my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  22. while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }
  23. print "Opening queue folder $queuedir\n";
  24. opendir(Q,$queuedir);
  25. foreach my $file ( readdir(Q) ) {
  26. next if $file =~ /^\./;
  27. print "processing file $file\n";
  28. if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
  29. my $docid = $1;
  30. my $ext = $3;
  31. print "\tdocument id $docid of type $ext\n";
  32. if ( $ext eq 'pdf' ) {
  33. my @pages;
  34. for(my $page=0;; $page++) {
  35. my $txt = '';
  36. open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
  37. while(<TXT>) {
  38. chomp;
  39. $txt .= ' ' . $_;
  40. }
  41. close(TXT);
  42. # end of PDF
  43. last if $?;
  44. print "\t\textracted text from PDF for page $page\n";
  45. my ($lang,$words) = detect_lang($txt);
  46. print "\t\tdetected language is $lang\n";
  47. my $pageid = get_new_page($docid);
  48. print "\t\tcreated page id $pageid\n";
  49. print "\t\tupdating page status to 'inprogress'\n";
  50. update_page_status($pageid, 'inprogress');
  51. if ( !exists $primary{$docid} ) {
  52. print "\t\tsetting document for default primary thumbnail\n";
  53. $primary{$docid}=undef;
  54. sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
  55. }
  56. print "\t\tcreating original page jpeg $pageid.jpeg";
  57. system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
  58. system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
  59. $originaldir, $pageid,
  60. $originaldir, $pageid ));
  61. print "\t\tloading extracted words into database\n";
  62. create_page_words($pageid, $lang, $words);
  63. print "\t\tupdating page status to 'ok'\n";
  64. update_page_status($pageid, 'ok');
  65. print "\tdone\n";
  66. }
  67. }
  68. elsif ( $ext =~ /^(jpeg|png)$/ ) {
  69. print "\tdetecting image rotation\n";
  70. my %res;
  71. for(my $rot=0; $rot<360; $rot+=90) {
  72. print "\ttrying $rot degrees rotation\n";
  73. my $tempfile = "/tmp/autodoc.$$.jpeg";
  74. system(sprintf("convert %s/%s -rotate %s %s",
  75. $queuedir, $file, $rot, $tempfile));
  76. print "\t\trunning OCR\n";
  77. my $txt = ocr_file($tempfile);
  78. print "\t\tlanguage and dictionary detection\n";
  79. my($lang,$words, $dictmatches) = detect_lang($txt);
  80. print "\t\tfound $dictmatches words in dictionary\n";
  81. $res{$rot} = {
  82. lang => $lang,
  83. words => $words,
  84. dictmatches => $dictmatches
  85. };
  86. unlink($tempfile);
  87. }
  88. my $maxwords = 0;
  89. my $bestrot;
  90. foreach my $rot ( keys %res ) {
  91. $bestrot=$rot if !defined $bestrot;
  92. if ( $maxwords < $res{$rot}{dictmatches} ) {
  93. $maxwords = $res{$rot}{dictmatches};
  94. $bestrot = $rot;
  95. }
  96. }
  97. print "\tbest OCR results with $bestrot rotation\n";
  98. my $pageid = get_new_page($docid);
  99. print "\t\tcreated page id $pageid\n";
  100. print "\t\tupdating page status to 'inprogress'\n";
  101. update_page_status($pageid, 'inprogress');
  102. if ( !exists $primary{$docid} ) {
  103. print "\t\tsetting document for default primary thumbnail\n";
  104. $primary{$docid}=undef;
  105. sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
  106. }
  107. print "\t\tcreating original page jpeg $pageid.jpeg";
  108. system(sprintf("convert %s/%s %s/%s.jpeg", $queuedir, $file, $originaldir, $pageid));
  109. print "\t\tloading extracted words into database\n";
  110. create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
  111. print "\t\tupdating page status to 'ok'\n";
  112. update_page_status($pageid, 'ok');
  113. print "\tdone\n";
  114. }
  115. else {
  116. print "\terror: don't know how to process files of $ext type";
  117. }
  118. }
  119. else {
  120. print "\terror: file doesn't contain manadatory UUIDs in its name\n";
  121. }
  122. print "\tdeleting $file\n";
  123. unlink("$queuedir/$file");
  124. }
  125. closedir(Q);
  126. print "done\n";
  127. # open queue
  128. # decompose PDF
  129. # normalise all files as jpegs
  130. # generate page
  131. # ocr / lang detect
  132. # update db
  133. sub ocr_file {
  134. my($file) = @_;
  135. my $txt = '';
  136. open(OCR,sprintf("tesseract -l eng+deu+fra+ita %s - |", $file));
  137. while(<OCR>) {
  138. $txt .= $_;
  139. }
  140. close(OCR);
  141. return $txt;
  142. }
  143. sub create_page_words {
  144. my($pageid, $lang, $words) = @_;
  145. foreach my $word ( @{$words} ) {
  146. sqlquery($dbh, "CALL add_page_word(?,?,?)",
  147. $pageid, $word, $langid{$lang});
  148. }
  149. }
  150. sub spell_check {
  151. my($txt, $lang) = @_;
  152. my $tmp = File::Temp->new();
  153. print $tmp $txt;
  154. open(F, sprintf("cat %s | aspell --lang=%s --ignore-case pipe |",$tmp->filename, $lang));
  155. while(<F>) {
  156. print $_;
  157. }
  158. close(F);
  159. }
  160. sub detect_lang {
  161. my($txt) = @_;
  162. my %lcnt;
  163. my @words;
  164. my $dictwords=0;
  165. foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
  166. next if length $word < 3;
  167. $word = lc($word);
  168. push @words, $word;
  169. my $lang;
  170. my $found=0;
  171. my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
  172. while(my ($l) = $q->fetchrow_array()) {
  173. $lcnt{$l}++;
  174. $found=1;
  175. }
  176. $dictwords++ if $found;
  177. }
  178. #print Dumper(\%lcnt);
  179. my $max = 0;
  180. my $lmax;
  181. foreach my $lang ( keys %lcnt ) {
  182. $lmax = $lang if !defined $lmax;
  183. if ( $lcnt{$lang} > $max ) {
  184. $lmax = $lang;
  185. $max = $lcnt{$lang};
  186. }
  187. }
  188. return ($lmax, \@words, $dictwords);
  189. }
  190. sub update_page_status {
  191. my($pageid, $status) = @_;
  192. sqlquery($dbh, "CALL update_page_status(?,?)",$pageid, $status);
  193. }
  194. sub get_new_page {
  195. my($docid) = @_;
  196. my $pageid;
  197. my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
  198. while(my($id)=$q->fetchrow_array()) {
  199. $pageid = $id;
  200. }
  201. return $pageid;
  202. }
  203. sub gen_uuid {
  204. my $ug = Data::UUID->new;
  205. return lc($ug->create_str());
  206. }
  207. sub load_conf {
  208. my($file) = @_;
  209. my $x='';
  210. open(F,"$file") || die "Failed to load configuration file";
  211. while(<F>) { $x.=$_; }
  212. close(F);
  213. return from_json($x);
  214. }
  215. sub sqlconnect {
  216. my($sql) = @_;
  217. my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
  218. my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}, { mysql_enable_utf8 => 1 }) || \
  219. die "Failed to connect to database";
  220. return $dbh;
  221. }
  222. sub sqlquery {
  223. my $dbh = shift;
  224. my $query = shift;
  225. my @args = @_;
  226. #print STDERR "$query\n";
  227. my $sth = $dbh->prepare($query) || die "Failed to execute SQL query";
  228. $sth->execute(@args) || die "Failed to execute SQL query";
  229. return $sth;
  230. }