An unfinished system to manage all your paper documentation in an easy way.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

files_ingest.pl 6.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. #!/usr/bin/perl
  2. use strict;
  3. use lib '/opt/autodoc/lib';
  4. use Autodoc;
  5. use JSON;
  6. use DBI;
  7. use utf8;
  8. use GD::Simple;
  9. use Data::Dumper;
  10. use Data::UUID;
  11. use File::Temp;
  12. use warnings;
  # ---------------------------------------------------------------------------
  # Script-wide state: Data::Dumper settings, configuration, database handle,
  # working folders and the language lookup table used by the subs below.
  # ---------------------------------------------------------------------------
  $Data::Dumper::Sortkeys = 1;

  # Documents that already had a primary (thumbnail) page assigned in this run.
  my %primary;

  # Maps a language short code to its id in the "languages" table.
  my %langid;

  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");

  print "Connecting to database\n";
  my $dbh = sqlconnect( $conf->{sql} );

  # Both working folders live below the configured global path.
  my $queuedir    = join( '/', $conf->{path}{global}, $conf->{path}{queue} );
  my $originaldir = join( '/', $conf->{path}{global}, $conf->{path}{original} );

  print "Loading languages\n";
  my $lang_query = sqlquery( $dbh, "SELECT id,short FROM languages" );
  while ( my @lang_row = $lang_query->fetchrow_array() ) {
      my ( $id, $short ) = @lang_row;
      $langid{$short} = $id;
  }
  # ---------------------------------------------------------------------------
  # Main loop: ingest every file found in the queue folder.
  #
  # A queue entry must be named "<uuid>_<uuid>.<ext>"; the first UUID is used
  # as the document id.  PDFs are split into pages, jpeg/png files are treated
  # as a single page.  Every queue entry is deleted once it has been looked at,
  # whether or not it could be processed.
  # ---------------------------------------------------------------------------
  print "Opening queue folder $queuedir\n";
  opendir( my $queue_dh, $queuedir )
      or die "cannot open queue folder $queuedir: $!\n";
  my @queued = readdir($queue_dh);
  closedir($queue_dh);

  my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

  foreach my $file (@queued) {
      next if $file =~ /^\./;
      print "processing file $file\n";

      # \A and \z (rather than ^ and $) so that a name ending in a newline is
      # rejected; the name is later interpolated into a shell command.
      if ( $file =~ /\A($uuid_re)_($uuid_re)\.([a-z]+)\z/ ) {
          my ( $docid, $ext ) = ( $1, $3 );
          print "\tdocument id $docid of type $ext\n";
          if ( $ext eq 'pdf' ) {
              _ingest_pdf( $docid, $file );
          }
          elsif ( $ext eq 'jpeg' || $ext eq 'png' ) {
              _ingest_image( $docid, $file );
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain manadatory UUIDs in its name\n";
      }

      print "\tdeleting $file\n";
      unlink("$queuedir/$file")
          or warn "\twarning: could not delete $queuedir/$file: $!\n";
  }
  print "done\n";

  # Create a new page row for $docid, mark it 'inprogress' and, if this is the
  # first page seen for the document during this run, register it as the
  # document's primary page.  Returns the new page id.
  sub _start_page {
      my ($docid) = @_;

      my $pageid = get_new_page($docid);
      print "\t\tcreated page id $pageid\n";
      print "\t\tupdating page status to 'inprogress'\n";
      update_page_status( $pageid, 'inprogress' );

      if ( !exists $primary{$docid} ) {
          print "\t\tsetting document for default primary thumbnail\n";
          $primary{$docid} = undef;    # only the key's existence matters
          sqlquery( $dbh, "CALL set_primary_page(?)", $pageid );
      }
      return $pageid;
  }

  # Ingest a queued PDF page by page: extract the text with pdftotext, detect
  # its language, render the page to "<pageid>.jpeg" in the original folder and
  # store the extracted words.  Stops at the first page for which pdftotext
  # cannot be started or exits with a non-zero status.
  sub _ingest_pdf {
      my ( $docid, $file ) = @_;
      my $source = "$queuedir/$file";

      PAGE:
      for ( my $page = 0 ; ; $page++ ) {
          my $pageno = $page + 1;    # the command line tools get 1-based pages

          # Kept as a shell command because of the stderr redirection; $file
          # has already been validated against the UUID pattern by the caller.
          my $cmd = sprintf( "pdftotext -f %s -l %s %s/%s - 2>/dev/null",
              $pageno, $pageno, $queuedir, $file );
          open( my $txt_fh, '-|', $cmd ) or last PAGE;

          my $txt = '';
          while ( my $line = <$txt_fh> ) {
              chomp $line;
              $txt .= ' ' . $line;
          }
          close($txt_fh);

          # A non-zero exit status is taken as "end of PDF".
          last PAGE if $?;

          print "\t\textracted text from PDF for page $page\n";
          my ( $lang, $words ) = detect_lang($txt);

          # No language is returned when no word matched the dictionary.
          print "\t\tdetected language is "
              . ( defined $lang ? $lang : 'unknown' ) . "\n";

          my $pageid = _start_page($docid);

          print "\t\tcreating original page jpeg $pageid.jpeg\n";
          system( 'pdftoppm', '-f', $pageno, '-l', $pageno, '-r', 300,
              '-jpeg', '-singlefile', $source, "$originaldir/$pageid" ) == 0
              or warn "\t\twarning: pdftoppm failed for $source page $pageno (status $?)\n";

          # pdftoppm writes "<prefix>.jpg"; the rest of the script uses ".jpeg".
          rename( "$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg" )
              or warn "\t\twarning: could not rename $originaldir/$pageid.jpg: $!\n";

          print "\t\tloading extracted words into database\n";
          create_page_words( $pageid, $lang, $words );
          print "\t\tupdating page status to 'ok'\n";
          update_page_status( $pageid, 'ok' );
          print "\tdone\n";
      }
      return;
  }

  # Ingest a queued jpeg/png as a single page.  The image is OCRed at 0, 90,
  # 180 and 270 degrees; the words of the rotation with the most dictionary
  # matches are stored.  The image itself is converted, unrotated, to
  # "<pageid>.jpeg" in the original folder.
  sub _ingest_image {
      my ( $docid, $file ) = @_;
      my $source = "$queuedir/$file";

      print "\tdetecting image rotation\n";
      my %res;
      foreach my $rot ( 0, 90, 180, 270 ) {
          print "\ttrying $rot degrees rotation\n";

          # Unpredictable temp file name; removed automatically when $tmp
          # goes out of scope at the end of this iteration.
          my $tmp      = File::Temp->new( SUFFIX => '.jpeg' );
          my $tempfile = $tmp->filename;

          system( 'convert', $source, '-rotate', $rot, $tempfile ) == 0
              or warn "\t\twarning: convert failed for $source (status $?)\n";

          print "\t\trunning OCR\n";
          my $txt = ocr_file($tempfile);
          print "\t\tlanguage and dictionary detection\n";
          my ( $lang, $words, $dictmatches ) = detect_lang($txt);
          print "\t\tfound $dictmatches words in dictionary\n";
          $res{$rot} = {
              lang        => $lang,
              words       => $words,
              dictmatches => $dictmatches,
          };
      }

      # Numeric order makes the choice deterministic: on a tie (including
      # "no matches at all") the smallest rotation wins.
      my $maxwords = 0;
      my $bestrot;
      foreach my $rot ( sort { $a <=> $b } keys %res ) {
          $bestrot = $rot if !defined $bestrot;
          if ( $maxwords < $res{$rot}{dictmatches} ) {
              $maxwords = $res{$rot}{dictmatches};
              $bestrot  = $rot;
          }
      }
      print "\tbest OCR results with $bestrot rotation\n";

      my $pageid = _start_page($docid);

      print "\t\tcreating original page jpeg $pageid.jpeg\n";
      system( 'convert', $source, "$originaldir/$pageid.jpeg" ) == 0
          or warn "\t\twarning: convert failed for $source (status $?)\n";

      print "\t\tloading extracted words into database\n";
      create_page_words( $pageid, $res{$bestrot}{lang}, $res{$bestrot}{words} );
      print "\t\tupdating page status to 'ok'\n";
      update_page_status( $pageid, 'ok' );
      print "\tdone\n";
      return;
  }
  129. # open queue
  130. # decompose PDF
  131. # normalise all files as jpegs
  132. # generate page
  133. # ocr / lang detect
  134. # update db
  # Run tesseract on an image file and return the recognised text.
  #
  #   $file - path of the image to OCR
  #
  # Returns the text tesseract wrote to its standard output, or an empty
  # string when tesseract could not be started or produced nothing.
  sub ocr_file {
      my ($file) = @_;
      my $txt = '';

      # List-form pipe open: the arguments go straight to tesseract without a
      # shell, so the path may contain spaces or shell metacharacters.
      # Earlier language set, kept for reference: -l eng+deu+fra+ita
      my $ocr_fh;
      if ( !open( $ocr_fh, '-|', 'tesseract', '-l', 'Latin', $file, '-' ) ) {
          warn "ocr_file: cannot run tesseract on $file: $!\n";
          return $txt;
      }

      while ( my $line = <$ocr_fh> ) {
          $txt .= $line;
      }
      close($ocr_fh);

      return $txt;
  }
  # Store the words extracted from a page.
  #
  #   $pageid - id of the page the words belong to
  #   $lang   - language short code (a key of %langid); may be undefined
  #   $words  - array reference of words
  #
  # Calls the create_page_word stored procedure once per word.
  sub create_page_words {
      my ( $pageid, $lang, $words ) = @_;

      # Resolve the language id once instead of once per word.  detect_lang
      # returns no language when nothing matched the dictionary; in that case
      # undef is passed on without triggering an "uninitialized" warning.
      my $lang_id = defined $lang ? $langid{$lang} : undef;

      foreach my $word ( @{ $words || [] } ) {
          sqlquery( $dbh, "CALL create_page_word(?,?,?)",
              $pageid, $word, $lang_id );
      }
      return;
  }
  # Feed a text to "aspell pipe" and print aspell's output to STDOUT.
  #
  #   $txt  - text to check
  #   $lang - language code handed to aspell's --lang option
  #
  # Returns the result of closing the aspell pipe, or nothing (false) when the
  # language code is rejected or aspell cannot be started.
  sub spell_check {
      my ( $txt, $lang ) = @_;

      # $lang ends up inside a shell command line, so only accept something
      # that looks like a plain language code (e.g. "en", "en_US", "pt-BR").
      if ( !defined $lang || $lang !~ /\A[A-Za-z][A-Za-z_-]*\z/ ) {
          warn "spell_check: refusing unexpected language code\n";
          return;
      }

      my $tmp = File::Temp->new();
      print {$tmp} $txt;

      # The text is still sitting in Perl's output buffer; flush it so that
      # aspell actually finds it in the file.
      $tmp->flush();

      my $cmd = sprintf( "aspell --lang=%s --ignore-case pipe < %s",
          $lang, $tmp->filename );
      my $aspell_fh;
      if ( !open( $aspell_fh, '-|', $cmd ) ) {
          warn "spell_check: cannot run aspell: $!\n";
          return;
      }

      while ( my $line = <$aspell_fh> ) {
          print $line;
      }
      return close($aspell_fh);
  }
  # Split a text into words and guess its language from the dictionary table.
  #
  #   $txt - text to analyse (UTF-8 encoded bytes or an already decoded string)
  #
  # Every run of at least three Latin letters is lower-cased, recorded as a
  # word and looked up in the "dict" table; each language a word is found in
  # gets one vote per matching row.
  #
  # Returns a three element list:
  #   - the language with the most votes (undef when nothing matched; on a
  #     tie the alphabetically first language wins),
  #   - an array reference with all words of three or more letters,
  #   - the number of words found in the dictionary.
  sub detect_lang {
      my ($txt) = @_;
      $txt = '' if !defined $txt;

      # The external tools hand us raw bytes.  Decode them so that an accented
      # letter is one character instead of two bytes; utf8::decode leaves the
      # string untouched when the bytes are not well-formed UTF-8.
      my $decoded = $txt;
      utf8::decode($decoded) if !utf8::is_utf8($decoded);

      my %lcnt;
      my @words;
      my $dictwords = 0;

      # Letters are a-z, A-Z and the accented Latin letters U+00C0..U+00FF
      # (minus the multiplication and division signs U+00D7 / U+00F7).  The
      # former class \x{c380}-\x{c3bf} named the code points U+C380..U+C3BF,
      # not the UTF-8 byte pairs C3 80..C3 BF, so accented words were split.
      # NOTE(review): words may now contain non-ASCII characters; confirm the
      # database handle is configured to accept them.
      my $separator = qr/[^a-zA-Z\x{c0}-\x{d6}\x{d8}-\x{f6}\x{f8}-\x{ff}]/;

      foreach my $token ( split( $separator, $decoded ) ) {
          next if length($token) < 3;
          my $word = lc($token);
          push @words, $word;

          my $found = 0;
          my $q = sqlquery( $dbh, "SELECT lang FROM dict WHERE word like ?", $word );
          while ( my ($l) = $q->fetchrow_array() ) {
              $lcnt{$l}++;
              $found = 1;
          }
          $dictwords++ if $found;
      }

      # Sorted keys make the winner reproducible when two languages tie.
      my $max = 0;
      my $lmax;
      foreach my $lang ( sort keys %lcnt ) {
          if ( !defined $lmax || $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max  = $lcnt{$lang};
          }
      }

      return ( $lmax, \@words, $dictwords );
  }
  # Set the status of a page through the set_page_status stored procedure.
  #
  #   $pageid - id of the page to update
  #   $status - new status value (the script uses 'inprogress' and 'ok')
  #
  # Returns whatever sqlquery returns.
  sub update_page_status {
      my $pageid = shift;
      my $status = shift;

      return sqlquery( $dbh, "CALL set_page_status(?,?)", $pageid, $status );
  }
  # Create a new page for a document through the create_page stored procedure.
  #
  #   $docid - id of the document the page belongs to
  #
  # Returns the first column of the last row the procedure produced, or undef
  # when it produced no row.
  sub get_new_page {
      my $docid = shift;

      my $sth = sqlquery( $dbh, "CALL create_page(?)", $docid );

      my $pageid;
      while ( my @row = $sth->fetchrow_array() ) {
          $pageid = $row[0];
      }
      return $pageid;
  }
  # Generate a new UUID with Data::UUID and return its string form in lower case.
  sub gen_uuid {
      my $uuid_string = Data::UUID->new->create_str();
      return lc $uuid_string;
  }