An unfinished system to manage all your paper documentation in an easy way.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

autodoc_process.pl 6.7KB

5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
5 vuotta sitten
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use utf8;
  6. use GD::Simple;
  7. use Data::Dumper;
  8. use Data::UUID;
  9. use File::Temp;
  10. use warnings;
  # Have Data::Dumper emit hash keys in sorted order.
  $Data::Dumper::Sortkeys = 1;

  # Read the JSON configuration, then open the database connection it describes.
  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");

  print "Connecting to database\n";
  my $dbh = sqlconnect($conf->{sql});

  # The queue and original-image directories both live below the global path.
  my $queuedir    = "$conf->{path}{global}/$conf->{path}{queue}";
  my $originaldir = "$conf->{path}{global}/$conf->{path}{original}";

  # Keyed by document id; filled in by the queue processing further down.
  my %primary;

  # Build the lookup table from language short code to database id.
  print "Loading languages\n";
  my %langid;
  my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  while ( my @row = $q->fetchrow_array() ) {
      my ($lang_id, $short_code) = @row;
      $langid{$short_code} = $lang_id;
  }

  print "Opening queue folder $queuedir\n";
  # Process every file waiting in the queue directory.
  #
  # A queue file must be named "<uuid>_<uuid>.<ext>"; the first UUID is used
  # as the document id, the second one is not used here.
  #   * pdf       - handled page by page: text comes from pdftotext, the page
  #                 image from pdftoppm (300 dpi JPEG).
  #   * jpeg/png  - OCRed at 0/90/180/270 degrees; the words of the rotation
  #                 with the most dictionary matches are stored.
  # Every visited file is removed from the queue afterwards, including files
  # that could not be processed.
  opendir(my $queue, $queuedir)
      or die "Failed to open queue folder $queuedir: $!";

  my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

  foreach my $file ( readdir($queue) ) {
      next if $file =~ /^\./;
      print "processing file $file\n";

      # \z (not $) so that a name ending in a newline is rejected as well.
      if ( $file =~ /^($uuid_re)_($uuid_re)\.([a-z]+)\z/ ) {
          my $docid = $1;
          my $ext = $3;
          print "\tdocument id $docid of type $ext\n";

          if ( $ext eq 'pdf' ) {
              for (my $page = 0; ; $page++) {
                  # $page is zero-based; the poppler tools get $page+1.
                  my $pageno = $page + 1;
                  my $txt = '';

                  # This command still goes through the shell because of the
                  # stderr redirect; $file has been validated by the pattern
                  # above and $queuedir comes from the configuration file.
                  my $cmd = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
                                    $pageno, $pageno, $queuedir, $file);
                  open(my $pdftext, '-|', $cmd) or last;
                  while ( my $line = <$pdftext> ) {
                      chomp $line;
                      $txt .= ' ' . $line;
                  }
                  close($pdftext);
                  # A failing pdftotext is treated as the end of the PDF.
                  last if $?;

                  print "\t\textracted text from PDF for page $page\n";
                  my ($lang, $words) = detect_lang($txt);
                  # detect_lang returns undef when no word matched the dictionary.
                  my $langname = defined $lang ? $lang : 'unknown';
                  print "\t\tdetected language is $langname\n";

                  my $pageid = get_new_page($docid);
                  print "\t\tcreated page id $pageid\n";
                  print "\t\tupdating page status to 'inprogress'\n";
                  update_page_status($pageid, 'inprogress');

                  # The first page created for a document in this run becomes
                  # its primary page.
                  if ( !exists $primary{$docid} ) {
                      print "\t\tsetting document for default primary thumbnail\n";
                      $primary{$docid} = undef;
                      sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
                  }

                  print "\t\tcreating original page jpeg $pageid.jpeg\n";
                  # List-form system: no shell is involved, so paths containing
                  # spaces or metacharacters are passed through unchanged.
                  system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', '300',
                         '-jpeg', '-singlefile',
                         "$queuedir/$file", "$originaldir/$pageid") == 0
                      or warn "pdftoppm failed for $file page $pageno\n";
                  # The script expects pdftoppm's output as <pageid>.jpg and
                  # stores it as <pageid>.jpeg.
                  rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
                      or warn "Failed to rename $originaldir/$pageid.jpg: $!\n";

                  print "\t\tloading extracted words into database\n";
                  create_page_words($pageid, $lang, $words);
                  print "\t\tupdating page status to 'ok'\n";
                  update_page_status($pageid, 'ok');
                  print "\tdone\n";
              }
          }
          elsif ( $ext eq 'jpeg' || $ext eq 'png' ) {
              print "\tdetecting image rotation\n";
              my %res;
              for (my $rot = 0; $rot < 360; $rot += 90) {
                  print "\ttrying $rot degrees rotation\n";

                  # File::Temp picks an unpredictable name and removes the file
                  # when $tmp goes out of scope at the end of this iteration.
                  my $tmp = File::Temp->new(SUFFIX => '.jpeg');
                  my $tempfile = $tmp->filename;
                  system('convert', "$queuedir/$file", '-rotate', $rot, $tempfile) == 0
                      or warn "convert failed for $file at $rot degrees\n";

                  print "\t\trunning OCR\n";
                  my $txt = ocr_file($tempfile);
                  print "\t\tlanguage and dictionary detection\n";
                  my ($lang, $words, $dictmatches) = detect_lang($txt);
                  print "\t\tfound $dictmatches words in dictionary\n";
                  $res{$rot} = {
                      lang        => $lang,
                      words       => $words,
                      dictmatches => $dictmatches,
                  };
              }

              # Keep the rotation with the most dictionary matches.  Visiting
              # the angles in ascending order makes a tie resolve to the
              # smallest rotation instead of depending on hash ordering.
              my $maxwords = 0;
              my $bestrot;
              foreach my $rot ( sort { $a <=> $b } keys %res ) {
                  $bestrot = $rot if !defined $bestrot;
                  if ( $maxwords < $res{$rot}{dictmatches} ) {
                      $maxwords = $res{$rot}{dictmatches};
                      $bestrot = $rot;
                  }
              }
              print "\tbest OCR results with $bestrot rotation\n";

              my $pageid = get_new_page($docid);
              print "\t\tcreated page id $pageid\n";
              print "\t\tupdating page status to 'inprogress'\n";
              update_page_status($pageid, 'inprogress');
              if ( !exists $primary{$docid} ) {
                  print "\t\tsetting document for default primary thumbnail\n";
                  $primary{$docid} = undef;
                  sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
              }

              print "\t\tcreating original page jpeg $pageid.jpeg\n";
              # The stored image is converted from the unrotated queue file.
              system('convert', "$queuedir/$file", "$originaldir/$pageid.jpeg") == 0
                  or warn "convert failed for $file\n";

              print "\t\tloading extracted words into database\n";
              create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
              print "\t\tupdating page status to 'ok'\n";
              update_page_status($pageid, 'ok');
              print "\tdone\n";
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain mandatory UUIDs in its name\n";
      }

      print "\tdeleting $file\n";
      unlink("$queuedir/$file");
  }
  closedir($queue);
  print "done\n";
  127. # open queue
  128. # decompose PDF
  129. # normalise all files as jpegs
  130. # generate page
  131. # ocr / lang detect
  132. # update db
  # Run tesseract on an image file and return the text it recognised.
  #
  # Parameters:
  #   $file - path of the image handed to tesseract.
  # Returns:
  #   Everything tesseract wrote to standard output (languages
  #   eng+deu+fra+ita), or '' when tesseract could not be started.
  sub ocr_file {
      my($file) = @_;
      my $txt = '';

      # List-form pipe open: $file reaches tesseract as one argument and is
      # never interpreted by a shell.
      my $ocr;
      unless ( open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-') ) {
          warn "Failed to run tesseract on $file: $!\n";
          return $txt;
      }
      while ( my $line = <$ocr> ) {
          $txt .= $line;
      }
      close($ocr);
      return $txt;
  }
  # Store the words extracted from a page.
  #
  # Parameters:
  #   $pageid - id of the page the words belong to.
  #   $lang   - language short code used as key into %langid; may be undef
  #             when no language was detected.
  #   $words  - array reference holding the words.
  # Calls the add_page_word stored procedure once per word.
  sub create_page_words {
      my($pageid, $lang, $words) = @_;

      # Resolve the language id once instead of once per word.  An undefined
      # language is still passed on as undef, but no longer triggers an
      # "uninitialized value" warning for every word.
      my $lang_id = defined $lang ? $langid{$lang} : undef;

      foreach my $word ( @{$words} ) {
          sqlquery($dbh, "CALL add_page_word(?,?,?)",
              $pageid, $word, $lang_id);
      }
      return;
  }
  # Pipe a text through aspell and print aspell's output to standard output.
  #
  # Parameters:
  #   $txt  - text to check.
  #   $lang - language name handed to aspell's --lang option.
  # Returns:
  #   The result of closing the aspell pipe, or 0 when it could not be opened.
  sub spell_check {
      my($txt, $lang) = @_;
      my $tmp = File::Temp->new();
      print $tmp $txt;
      # The text is still sitting in Perl's output buffer at this point; flush
      # it so the external command reads the complete file rather than an
      # empty or truncated one.
      $tmp->flush();

      my $cmd = sprintf("cat %s | aspell --lang=%s --ignore-case pipe", $tmp->filename, $lang);
      my $aspell;
      unless ( open($aspell, '-|', $cmd) ) {
          warn "Failed to run aspell: $!\n";
          return 0;
      }
      while ( my $line = <$aspell> ) {
          print $line;
      }
      return close($aspell);
  }
  # Split a text into words and guess its language from dictionary matches.
  #
  # Parameters:
  #   $txt - text to analyse; UTF-8 encoded bytes are decoded to characters.
  # Returns a three element list:
  #   - the language with the most rows matched in the dict table, or undef
  #     when no word matched at all,
  #   - an array reference with every lower-cased word of three or more
  #     characters, in order of appearance,
  #   - the number of words that had at least one dictionary match.
  sub detect_lang {
      my($txt) = @_;
      my %lcnt;
      my @words;
      my $dictwords=0;

      $txt = '' unless defined $txt;
      # Work on characters rather than UTF-8 bytes.  The former class
      # \x{c380}-\x{c3bf} named code points U+C380..U+C3BF, not the byte pairs
      # C3 80..C3 BF, so accented letters always acted as word separators.
      # utf8::decode leaves the string untouched when it is not valid UTF-8.
      utf8::decode($txt) unless utf8::is_utf8($txt);

      # A word is a run of ASCII letters and U+00C0..U+00FF characters.
      foreach my $word ( split(/[^a-zA-Z\x{c0}-\x{ff}]/, $txt) ) {
          next if length $word < 3;
          $word = lc($word);
          push @words, $word;

          my $found=0;
          my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
          while ( my ($dict_lang) = $q->fetchrow_array() ) {
              $lcnt{$dict_lang}++;
              $found=1;
          }
          $dictwords++ if $found;
      }

      # Pick the language with the highest count.  Sorting the keys makes a
      # tie resolve the same way on every run.
      my $max = 0;
      my $lmax;
      foreach my $lang ( sort keys %lcnt ) {
          $lmax = $lang if !defined $lmax;
          if ( $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max = $lcnt{$lang};
          }
      }
      return ($lmax, \@words, $dictwords);
  }
  # Set the status of a page through the update_page_status stored procedure.
  #
  # Parameters:
  #   $pageid - id of the page to update.
  #   $status - new status value.
  # Returns the statement handle produced by sqlquery.
  sub update_page_status {
      my $pageid = shift;
      my $status = shift;
      return sqlquery($dbh, "CALL update_page_status(?,?)", $pageid, $status);
  }
  # Create a page for a document through the create_page stored procedure.
  #
  # Parameters:
  #   $docid - id of the document the page belongs to.
  # Returns the first column of the last row the procedure produced, or undef
  # when it produced no rows.
  sub get_new_page {
      my $docid = shift;
      my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);

      my $latest;
      while ( my @row = $sth->fetchrow_array() ) {
          $latest = $row[0];
      }
      return $latest;
  }
  # Produce a fresh UUID string in lower case using Data::UUID.
  sub gen_uuid {
      my $generator = Data::UUID->new();
      my $uuid = $generator->create_str();
      return lc $uuid;
  }
  # Load a JSON configuration file.
  #
  # Parameters:
  #   $file - path of the JSON file.
  # Returns:
  #   The decoded data structure as produced by from_json.
  # Dies when the file cannot be opened or does not contain valid JSON.
  sub load_conf {
      my($file) = @_;

      # from_json expects a character string, so decode the file as UTF-8
      # while reading; raw bytes would corrupt non-ASCII values.
      open(my $fh, '<:encoding(UTF-8)', $file)
          or die "Failed to load configuration file $file: $!";
      my $json = do { local $/; <$fh> };   # slurp the whole file
      close($fh);

      return from_json(defined $json ? $json : '');
  }
  # Open the MySQL database connection described by the configuration.
  #
  # Parameters:
  #   $sql - hash reference with the keys base, host, user and pass.
  # Returns:
  #   The DBI database handle, connected with mysql_enable_utf8 switched on.
  # Dies with the driver's error text when the connection fails.
  sub sqlconnect {
      my($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}, { mysql_enable_utf8 => 1 })
          or die "Failed to connect to database: "
               . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
      return $dbh;
  }
  # Prepare and execute an SQL statement with bound parameters.
  #
  # Parameters:
  #   $dbh   - database handle to run the statement on.
  #   $query - SQL text with ? placeholders.
  #   @args  - values bound to the placeholders, in order.
  # Returns:
  #   The executed statement handle.
  # Dies when preparing or executing fails; the message names the failing
  # step, the statement text and the error reported by the handle.
  sub sqlquery {
      my $dbh = shift;
      my $query = shift;
      my @args = @_;

      my $sth = $dbh->prepare($query);
      unless ( $sth ) {
          my $why = $dbh->errstr;
          die "Failed to prepare SQL query [$query]: "
            . (defined $why ? $why : 'unknown error');
      }
      unless ( $sth->execute(@args) ) {
          my $why = $sth->errstr;
          die "Failed to execute SQL query [$query]: "
            . (defined $why ? $why : 'unknown error');
      }
      return $sth;
  }