An unfinished system to manage all your paper documentation in an easy way.
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

files_ingest.pl 6.1KB

5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
5年前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. #!/usr/bin/perl
  2. use strict;
  3. use lib '/opt/autodoc/lib';
  4. use Autodoc;
  5. use JSON;
  6. use DBI;
  7. use utf8;
  8. use GD::Simple;
  9. use Data::Dumper;
  10. use Data::UUID;
  11. use File::Temp;
  12. use warnings;
  $Data::Dumper::Sortkeys = 1;

  # ----------------------------------------------------------------------
  # Main ingest pass.
  #
  # Loads the configuration and the language table, then walks the queue
  # folder once. Files named "<uuid>_<uuid>.<ext>" are processed:
  #   * pdf       - every page gets its text extracted with pdftotext, a
  #                 page record, a 300 dpi JPEG rendered by pdftoppm and its
  #                 words loaded into the database;
  #   * jpeg/png  - the image is OCRed at 0/90/180/270 degrees, the rotation
  #                 with the most dictionary matches supplies the words, and
  #                 the (unrotated) image is converted to JPEG.
  # Every non-hidden queue entry is deleted afterwards, processed or not.
  #
  # $dbh and %langid are file-level lexicals also used by the subs below.
  # ----------------------------------------------------------------------
  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");
  print "Connecting to database\n";
  my $dbh = sqlconnect($conf->{sql});
  my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Documents that already received a primary (thumbnail) page in this run.
  my %primary;

  print "Loading languages\n";
  # Maps a language short name to its id in the languages table.
  my %langid;
  my $q = sqlquery($dbh, "SELECT id,short FROM languages");
  while (my ($id, $short) = $q->fetchrow_array()) {
      $langid{$short} = $id;
  }

  print "Opening queue folder $queuedir\n";
  # Fail loudly: without this check a missing folder looked like an empty queue.
  opendir(my $queue, $queuedir)
      or die "cannot open queue folder $queuedir: $!\n";
  my @queued = readdir($queue);
  closedir($queue);

  foreach my $file (@queued) {
      next if $file =~ /^\./;
      print "processing file $file\n";
      if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
          my $docid = $1;
          my $ext = $3;
          print "\tdocument id $docid of type $ext\n";
          if ( $ext eq 'pdf' ) {
              # Pages are probed one by one; pdftotext exits non-zero once
              # the requested page is past the end of the document.
              for (my $page = 0; ; $page++) {
                  my $txt = '';
                  # The shell is kept here only for the stderr redirection.
                  open(my $pdftext, '-|', sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null", $page+1, $page+1, $queuedir, $file))
                      or last;
                  while (my $line = <$pdftext>) {
                      chomp $line;
                      $txt .= ' ' . $line;
                  }
                  close($pdftext);
                  # end of PDF
                  last if $?;
                  print "\t\textracted text from PDF for page $page\n";
                  my ($lang, $words) = detect_lang($txt);
                  # detect_lang yields no language when nothing matched the dictionary.
                  print "\t\tdetected language is " . (defined $lang ? $lang : 'unknown') . "\n";
                  my $pageid = get_new_page($docid);
                  print "\t\tcreated page id $pageid\n";
                  print "\t\tupdating page status to 'inprogress'\n";
                  update_page_status($pageid, 'inprogress');
                  if ( !exists $primary{$docid} ) {
                      print "\t\tsetting document for default primary thumbnail\n";
                      $primary{$docid} = undef;
                      sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
                  }
                  print "\t\tcreating original page jpeg $pageid.jpeg\n";
                  # List-form system(): arguments reach pdftoppm verbatim, no shell.
                  if ( system('pdftoppm', '-f', $page+1, '-l', $page+1, '-r', 300,
                              '-jpeg', '-singlefile',
                              "$queuedir/$file", "$originaldir/$pageid") != 0 ) {
                      print "\t\terror: pdftoppm failed for page $pageid (status $?)\n";
                  }
                  # pdftoppm writes "<pageid>.jpg"; the store uses ".jpeg".
                  if ( !rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg") ) {
                      print "\t\terror: cannot rename $originaldir/$pageid.jpg: $!\n";
                  }
                  print "\t\tloading extracted words into database\n";
                  create_page_words($pageid, $lang, $words);
                  print "\t\tupdating page status to 'ok'\n";
                  update_page_status($pageid, 'ok');
                  print "\tdone\n";
              }
          }
          elsif ( $ext =~ /^(jpeg|png)$/ ) {
              print "\tdetecting image rotation\n";
              # OCR result per tried rotation: { lang, words, dictmatches }.
              my %res;
              for (my $rot = 0; $rot < 360; $rot += 90) {
                  print "\ttrying $rot degrees rotation\n";
                  # Unpredictable temp name; removed when $tmp goes out of scope.
                  my $tmp = File::Temp->new(SUFFIX => '.jpeg');
                  my $tempfile = $tmp->filename;
                  if ( system('convert', "$queuedir/$file", '-rotate', $rot, $tempfile) != 0 ) {
                      print "\t\terror: convert failed for $rot degrees rotation (status $?)\n";
                  }
                  print "\t\trunning OCR\n";
                  my $txt = ocr_file($tempfile);
                  print "\t\tlanguage and dictionary detection\n";
                  my ($lang, $words, $dictmatches) = detect_lang($txt);
                  print "\t\tfound $dictmatches words in dictionary\n";
                  $res{$rot} = {
                      lang        => $lang,
                      words       => $words,
                      dictmatches => $dictmatches
                  };
              }
              my $maxwords = 0;
              my $bestrot;
              # Numeric order makes ties deterministic: the smallest rotation
              # (0 first) wins instead of whatever the hash order happens to be.
              foreach my $rot ( sort { $a <=> $b } keys %res ) {
                  $bestrot = $rot if !defined $bestrot;
                  if ( $maxwords < $res{$rot}{dictmatches} ) {
                      $maxwords = $res{$rot}{dictmatches};
                      $bestrot = $rot;
                  }
              }
              print "\tbest OCR results with $bestrot rotation\n";
              my $pageid = get_new_page($docid);
              print "\t\tcreated page id $pageid\n";
              print "\t\tupdating page status to 'inprogress'\n";
              update_page_status($pageid, 'inprogress');
              if ( !exists $primary{$docid} ) {
                  print "\t\tsetting document for default primary thumbnail\n";
                  $primary{$docid} = undef;
                  sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
              }
              print "\t\tcreating original page jpeg $pageid.jpeg\n";
              if ( system('convert', "$queuedir/$file", "$originaldir/$pageid.jpeg") != 0 ) {
                  print "\t\terror: convert failed for page $pageid (status $?)\n";
              }
              print "\t\tloading extracted words into database\n";
              create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
              print "\t\tupdating page status to 'ok'\n";
              update_page_status($pageid, 'ok');
              print "\tdone\n";
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain manadatory UUIDs in its name\n";
      }
      print "\tdeleting $file\n";
      unlink("$queuedir/$file")
          or print "\terror: cannot delete $queuedir/$file: $!\n";
  }
  print "done\n";
  129. # open queue
  130. # decompose PDF
  131. # normalise all files as jpegs
  132. # generate page
  133. # ocr / lang detect
  134. # update db
  # Run tesseract on an image and return the text it recognised.
  #
  # Parameters:
  #   $file - path of the image file to OCR.
  # Returns:
  #   Everything tesseract wrote to stdout as one string (line breaks kept).
  #   An empty string when tesseract could not be started; a warning is
  #   emitted in that case.
  #
  # The "Latin" script model is used; an earlier variant of this call used
  # "-l eng+deu+fra+ita" instead.
  sub ocr_file {
      my ($file) = @_;
      my $txt = '';
      # List-form pipe open: $file is handed to tesseract as one argument and
      # never passes through a shell, so spaces or metacharacters are harmless.
      my $ocr;
      if ( !open($ocr, '-|', 'tesseract', '-l', 'Latin', $file, '-') ) {
          warn "cannot run tesseract on $file: $!\n";
          return $txt;
      }
      while (my $line = <$ocr>) {
          $txt .= $line;
      }
      close($ocr);
      return $txt;
  }
  # Store the words found on a page.
  #
  # Parameters:
  #   $pageid - id of the page the words belong to.
  #   $lang   - language short name as keyed in %langid; may be undef when no
  #             language was detected.
  #   $words  - array reference of words; one create_page_word call is made
  #             per element, duplicates included.
  # The language id passed to the procedure is undef when $lang is undef or
  # not present in %langid.
  sub create_page_words {
      my ($pageid, $lang, $words) = @_;
      # Resolved once instead of per word; the defined check avoids an
      # "uninitialized value" warning for every word of an undetected page.
      my $language_id = defined $lang ? $langid{$lang} : undef;
      foreach my $word ( @{$words} ) {
          sqlquery($dbh, "CALL create_page_word(?,?,?)",
              $pageid, $word, $language_id);
      }
      return;
  }
  # Feed a text to aspell in "pipe" mode and echo aspell's output to STDOUT.
  #
  # Parameters:
  #   $txt  - text to check.
  #   $lang - language passed to aspell's --lang option.
  # Returns the result of closing the aspell pipe (false when the command
  # exited non-zero), or nothing when the pipe could not be opened.
  #
  # NOTE(review): $lang is interpolated into a shell command; callers must
  # only pass trusted language codes.
  sub spell_check {
      my ($txt, $lang) = @_;
      my $tmp = File::Temp->new();
      print $tmp $txt;
      # The handle is buffered: without a flush the external command may read
      # an empty or truncated file.
      $tmp->flush();
      my $command = sprintf("cat %s | aspell --lang=%s --ignore-case pipe",
          $tmp->filename, $lang);
      my $aspell;
      if ( !open($aspell, '-|', $command) ) {
          warn "cannot run aspell: $!\n";
          return;
      }
      while (my $line = <$aspell>) {
          print $line;
      }
      return close($aspell);
  }
  # Split a text into words and guess its language from the dictionary table.
  #
  # Parameters:
  #   $txt - text to analyse. Assumed to hold undecoded UTF-8 bytes, which is
  #          what the pipe reads in this file produce -- TODO confirm for any
  #          other caller.
  # Returns a three element list:
  #   1. the language with the most dictionary hits (undef when no word
  #      matched; ties go to the alphabetically first language),
  #   2. an array reference with every lower-cased word of 3+ bytes, in text
  #      order, duplicates included,
  #   3. how many of those words had at least one dictionary row.
  sub detect_lang {
      my ($txt) = @_;
      $txt = '' if !defined $txt;
      my %lcnt;
      my @words;
      my $dictwords = 0;
      # Dictionary languages per distinct word, so a repeated word costs one
      # query instead of one per occurrence.
      my %langs_of;
      # A word is a run of ASCII letters and of the two-byte UTF-8 sequences
      # C3 80 .. C3 BF (U+00C0 .. U+00FF). The previous character class
      # \x{c380}-\x{c3bf} named code points U+C380 .. U+C3BF instead, so
      # accented letters acted as separators and cut words apart.
      while ( $txt =~ /((?:[a-zA-Z]|\xC3[\x80-\xBF])+)/g ) {
          my $word = $1;
          next if length $word < 3;
          $word = lc($word);
          push @words, $word;
          if ( !exists $langs_of{$word} ) {
              my @langs;
              my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
              while (my ($l) = $q->fetchrow_array()) {
                  push @langs, $l;
              }
              $langs_of{$word} = \@langs;
          }
          my @hits = @{ $langs_of{$word} };
          next if !@hits;
          foreach my $l (@hits) {
              $lcnt{$l}++;
          }
          $dictwords++;
      }
      #print Dumper(\%lcnt);
      my $max = 0;
      my $lmax;
      # Sorted keys keep the winner stable between runs when counts tie.
      foreach my $lang ( sort keys %lcnt ) {
          if ( $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max = $lcnt{$lang};
          }
      }
      return ($lmax, \@words, $dictwords);
  }
  # Record a new status for a page through the set_page_status procedure.
  #
  # Parameters:
  #   $pageid - id of the page to update.
  #   $status - status value to store (this file uses 'inprogress' and 'ok').
  # Returns whatever sqlquery returns for the call.
  sub update_page_status {
      my ($pageid, $status) = @_;
      my @bind_values = ($pageid, $status);
      return sqlquery($dbh, "CALL set_page_status(?,?)", @bind_values);
  }
  # Create a page record for a document and return the new page id.
  #
  # Parameters:
  #   $docid - id of the document the page belongs to.
  # Returns the first column of the last row produced by create_page, or
  # undef when the procedure returned no rows.
  sub get_new_page {
      my ($docid) = @_;
      my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);
      my $latest_id;
      # Drain the whole result set; the id from the final row wins.
      while ( my @row = $sth->fetchrow_array() ) {
          $latest_id = $row[0];
      }
      return $latest_id;
  }
  # Produce a fresh UUID string in lower case.
  #
  # Takes no arguments. A new Data::UUID generator is created per call and
  # its create_str() result is lower-cased before being returned.
  sub gen_uuid {
      my $generator = Data::UUID->new;
      my $uuid = $generator->create_str();
      return lc $uuid;
  }