An unfinished system to manage all your paper documentation in an easy way.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

autodoc_process.pl 5.2KB

5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
5 anos atrás
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use File::Temp;
  9. use warnings;
  # ---------------------------------------------------------------------------
  # Main script.
  #
  # Loads the configuration, connects to the database, then walks the queue
  # folder.  Queue files must be named "<uuid>_<uuid>.<ext>"; the first UUID is
  # used as the document id.  PDFs are split page by page: the text of each page
  # is extracted, its language detected, a page row created, a 300 dpi JPEG
  # rendered into the "original" folder and the words loaded into the database.
  # JPEG/PNG files currently only go through the rotation probe.
  # ---------------------------------------------------------------------------
  $Data::Dumper::Sortkeys = 1;

  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");

  print "Connecting to database\n";
  my $dbh = sqlconnect($conf->{sql});

  my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Documents for which a primary page has already been set during this run.
  my %primary;

  print "Loading languages\n";
  # Maps a language short code to its numeric id in the lang table.
  my %langid;
  my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  while ( my ($id, $short) = $q->fetchrow_array() ) {
      $langid{$short} = $id;
  }

  print "Opening queue folder $queuedir\n";
  opendir(my $queue, $queuedir)
      or die "Failed to open queue folder $queuedir: $!";
  # "." and ".." are directory bookkeeping, not queued documents.
  my @queued = grep { $_ ne '.' && $_ ne '..' } readdir($queue);
  closedir($queue);

  foreach my $file (@queued) {
      print "processing file $file\n";
      if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
          my $docid = $1;
          my $ext   = $3;
          print "\tdocument id $docid of type $ext\n";

          if ( $ext eq 'pdf' ) {
              # The page count is not known up front: keep extracting pages
              # until pdftotext reports a failure.
              for ( my $page = 0; ; $page++ ) {
                  my $pageno = $page + 1;    # the command-line tools are given 1-based pages
                  my $txt    = '';

                  # Kept as a shell command because of the stderr redirection;
                  # $file was validated by the pattern above.
                  my $cmd = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
                      $pageno, $pageno, $queuedir, $file);
                  open(my $pdftext, '-|', $cmd) or last;
                  while ( my $line = <$pdftext> ) {
                      chomp $line;
                      $txt .= ' ' . $line;
                  }
                  close($pdftext);
                  # end of PDF: close() stores the command's exit status in $?
                  last if $?;

                  print "\t\textracted text from PDF for page $page\n";
                  my ($lang, $words) = detect_lang($txt);
                  # detect_lang returns undef when no dictionary word matched.
                  printf "\t\tdetected language is %s\n", defined $lang ? $lang : 'unknown';

                  my $pageid = get_new_page($docid);
                  print "\t\tcreated page id $pageid\n";
                  print "\t\tupdating page status to 'inprogress'\n";
                  update_page_status($pageid, 'inprogress');

                  # The first page seen for a document becomes its primary page.
                  if ( !exists $primary{$docid} ) {
                      print "\t\tsetting document for default primary thumbnail\n";
                      $primary{$docid} = undef;
                      sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
                  }

                  print "\t\tcreating original page jpeg $pageid.jpeg\n";
                  # List form: no shell, so paths containing spaces are safe.
                  system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', 300,
                      '-jpeg', '-singlefile',
                      "$queuedir/$file", "$originaldir/$pageid") == 0
                      or warn "pdftoppm failed for $file page $pageno (status $?)\n";
                  # pdftoppm writes "<root>.jpg"; the rest of the system uses ".jpeg".
                  rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
                      or warn "Failed to rename $originaldir/$pageid.jpg: $!\n";

                  print "\t\tloading extracted words into database\n";
                  create_page_words($pageid, $lang, $words);
                  print "\t\tupdating page status to 'ok'\n";
                  update_page_status($pageid, 'ok');
              }
          }
          elsif ( $ext =~ /^(jpeg|png)$/ ) {
              print "\tdetecting image rotation\n";
              # 0, 90, 180 and 270 degrees; 360 would only repeat the 0 degree pass.
              for ( my $rot = 0; $rot < 360; $rot += 90 ) {
                  print "\t\ttrying $rot degrees rotation\n";
                  # Unpredictable name, removed automatically when $tmp goes out of scope.
                  my $tmp      = File::Temp->new(SUFFIX => '.jpeg');
                  my $tempfile = $tmp->filename;
                  system('convert', "$queuedir/$file", '-rotate', $rot, $tempfile) == 0
                      or warn "convert failed for $file at $rot degrees (status $?)\n";
                  my ($lang, $words) = detect_lang(ocr_file($tempfile));
                  print Dumper($lang, $words);
              }
              print "\trunning OCR on page\n";
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain manadatory UUIDs in its name\n";
      }
  }
  print "done\n";
  88. # open queue
  89. # decompose PDF
  90. # normalise all files as jpegs
  91. # generate page
  92. # ocr / lang detect
  93. # update db
  # Run tesseract on an image file and return whatever it prints on stdout.
  #
  # Parameters:
  #   $file - path of the image to recognise
  # Returns:
  #   the captured output as one string; '' when tesseract printed nothing or
  #   could not be started (a warning is emitted in the latter case).
  sub ocr_file {
      my ($file) = @_;
      my $txt = '';

      # List-form pipe open: the command is executed without a shell, so $file
      # needs no quoting.  The trailing "-" is tesseract's output base; the
      # recognised text is read from the pipe.
      my $ocr;
      if ( !open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-') ) {
          warn "Failed to run tesseract on $file: $!\n";
          return $txt;
      }
      while ( my $line = <$ocr> ) {
          $txt .= $line;
      }
      close($ocr);

      return $txt;
  }
  # Store every word of a page through the add_page_word stored procedure.
  #
  # Parameters:
  #   $pageid - id of the page the words belong to
  #   $lang   - language short code (a key of %langid); may be undef
  #   $words  - array reference of words; undef is treated as an empty list
  # Returns nothing.
  sub create_page_words {
      my ($pageid, $lang, $words) = @_;

      # detect_lang returns an undefined language when no word matched the
      # dictionary; pass NULL instead of looking up an undefined hash key.
      my $lang_id = defined $lang ? $langid{$lang} : undef;

      foreach my $word ( @{ $words || [] } ) {
          sqlquery($dbh, "CALL add_page_word(?,?,?)",
              $pageid, $word, $lang_id);
      }
      return;
  }
  # Pipe a text through "aspell pipe" and print aspell's output to STDOUT.
  #
  # Parameters:
  #   $txt  - text to check
  #   $lang - language passed to aspell's --lang option
  # Returns nothing.
  sub spell_check {
      my ($txt, $lang) = @_;

      my $tmp = File::Temp->new();
      print {$tmp} $txt;
      # The text may still sit in Perl's output buffer; push it to disk before
      # an external process reads the file.
      $tmp->flush();

      my $cmd = sprintf("cat %s | aspell --lang=%s --ignore-case pipe",
          $tmp->filename, $lang);
      my $aspell;
      if ( !open($aspell, '-|', $cmd) ) {
          warn "Failed to run aspell: $!\n";
          return;
      }
      while ( my $line = <$aspell> ) {
          print $line;
      }
      close($aspell);
      return;
  }
  # Guess the language of a text by looking its words up in the dict table.
  #
  # The text is cut into words made of ASCII letters and the two-byte UTF-8
  # sequences C3 80 .. C3 BF (the accented Latin-1 letters).  Words shorter
  # than three bytes are ignored; the others are lower-cased with lc() and
  # looked up, and every language a word belongs to gets one vote per
  # occurrence.
  #
  # NOTE(review): this assumes $txt is an undecoded UTF-8 byte string, which is
  # what the callers in this file pass (text read from pipes without an
  # encoding layer) - confirm before feeding decoded text.
  #
  # Parameters:
  #   $txt - text to analyse (undef is treated as '')
  # Returns a two element list:
  #   - the language with the most votes, undef when no word matched; ties go
  #     to the alphabetically first language
  #   - an array reference with all words considered, in order of appearance
  sub detect_lang {
      my ($txt) = @_;
      $txt = '' if !defined $txt;

      my %lcnt;
      my @words;
      my %langs_of;    # word => languages found for it, so repeats cost no query

      # \x{c380}-\x{c3bf} in a character class would denote the code points
      # U+C380..U+C3BF, not the byte pairs C3 80..C3 BF, and would cut every
      # accented word in two; match the byte pairs explicitly instead.
      while ( $txt =~ /((?:[a-zA-Z]|\xC3[\x80-\xBF])+)/g ) {
          my $word = $1;
          next if length($word) < 3;
          $word = lc($word);
          push @words, $word;

          if ( !exists $langs_of{$word} ) {
              my @langs;
              my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
              while ( my ($l) = $q->fetchrow_array() ) {
                  push @langs, $l;
              }
              $langs_of{$word} = \@langs;
          }
          $lcnt{$_}++ for @{ $langs_of{$word} };
      }
      print Dumper(\%lcnt);

      my $max = 0;
      my $lmax;
      # Sorted so that a tie is always resolved the same way.
      foreach my $lang ( sort keys %lcnt ) {
          if ( $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max  = $lcnt{$lang};
          }
      }
      return ($lmax, \@words);
  }
  # Set the status of a page by calling the update_page_status stored
  # procedure with the page id and the new status.
  #
  # Parameters:
  #   $pageid - id of the page to update
  #   $status - new status value
  # Returns the statement handle produced by sqlquery.
  sub update_page_status {
      my $pageid = shift;
      my $status = shift;

      my @bind = ($pageid, $status);
      return sqlquery($dbh, "CALL update_page_status(?,?)", @bind);
  }
  # Create a page for a document through the create_page stored procedure.
  #
  # Parameters:
  #   $docid - id of the document the page belongs to
  # Returns the first column of the last row of the procedure's result set,
  # or undef when it returned no rows.
  sub get_new_page {
      my $docid = shift;

      my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);

      # Read the result set to its end; the last row wins.
      my $new_id;
      while ( my @row = $sth->fetchrow_array() ) {
          $new_id = $row[0];
      }
      return $new_id;
  }
  # Produce a new UUID string with Data::UUID, folded to lower case.
  #
  # Takes no parameters.
  # Returns the lower-cased result of Data::UUID's create_str.
  sub gen_uuid {
      my $generator = Data::UUID->new;
      my $uuid      = $generator->create_str();
      return lc $uuid;
  }
  # Read a JSON configuration file and return the decoded data structure.
  #
  # Parameters:
  #   $file - path of the JSON file
  # Returns whatever from_json produces for the file content.
  # Dies when the file cannot be opened; the message names the file and the
  # system error.  from_json dies on its own for malformed JSON.
  sub load_conf {
      my ($file) = @_;

      # Three-argument open with a lexical handle: the file name can never be
      # mistaken for an open mode, and the handle is not a global bareword.
      open(my $fh, '<', $file)
          or die "Failed to load configuration file $file: $!";
      # Slurp the whole file in one read.
      my $json = do { local $/; <$fh> };
      close($fh);

      return from_json($json);
  }
  # Open a MySQL connection through DBI.
  #
  # Parameters:
  #   $sql - hash reference with the keys base (database name), host, user
  #          and pass
  # Returns the DBI database handle.
  # Dies when the connection cannot be established; the message carries
  # DBI's error text.
  sub sqlconnect {
      my ($sql) = @_;

      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
          or die "Failed to connect to database: "
              . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
      return $dbh;
  }
  # Prepare and execute an SQL statement with bound parameters.
  #
  # Parameters:
  #   $dbh   - database handle to run the statement on
  #   $query - SQL text with "?" placeholders
  #   @args  - values bound to the placeholders, in order
  # Returns the executed statement handle.
  # Dies when preparing or executing fails; the message says which step
  # failed and includes the handle's error text and the query.
  sub sqlquery {
      my ($dbh, $query, @args) = @_;
      #print STDERR "$query\n";

      my $sth = $dbh->prepare($query);
      if ( !$sth ) {
          my $why = $dbh->errstr;
          die "Failed to prepare SQL query [$query]: "
              . (defined $why ? $why : 'unknown error');
      }

      if ( !$sth->execute(@args) ) {
          my $why = $sth->errstr;
          die "Failed to execute SQL query [$query]: "
              . (defined $why ? $why : 'unknown error');
      }
      return $sth;
  }