An unfinished system to manage all your paper documentation in an easy way.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

autodoc_process.pl 6.6KB

5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
5 년 전
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use File::Temp;
  9. use warnings;
  # Queue processor.
  #
  # Loads the JSON configuration, connects to the database, then handles every
  # non-hidden entry of the queue directory:
  #   * <uuid>_<uuid>.pdf       - one page record per PDF page; the text comes
  #                               from pdftotext, the page image from pdftoppm.
  #   * <uuid>_<uuid>.jpeg/.png - one page record; the image is OCR'd at four
  #                               rotations and the words of the rotation with
  #                               the most dictionary matches are stored.
  # Every queue entry is deleted after it has been handled, recognised or not.
  #
  # $dbh and %langid are file-scoped on purpose: the helper subs below use them.
  $Data::Dumper::Sortkeys = 1;

  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");

  print "Connecting to database\n";
  my $dbh = sqlconnect($conf->{sql});

  my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Documents that already received a primary (thumbnail) page in this run.
  my %primary;

  print "Loading languages\n";
  my %langid;
  my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  while (my ($id, $short) = $q->fetchrow_array()) {
      $langid{$short} = $id;
  }

  print "Opening queue folder $queuedir\n";
  opendir(my $queue, $queuedir)
      or die "Failed to open queue folder $queuedir: $!";
  # Snapshot the listing first: entries are deleted while we iterate.
  my @queued = readdir($queue);
  closedir($queue);

  my $uuid_re   = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;
  my @rotations = (0, 90, 180, 270);

  foreach my $file (@queued) {
      next if $file =~ /^\./;
      print "processing file $file\n";
      my $source = "$queuedir/$file";

      # Expected name: <document uuid>_<second uuid>.<extension>
      if (my ($docid, $ext) = $file =~ /\A($uuid_re)_$uuid_re\.([a-z]+)\z/) {
          print "\tdocument id $docid of type $ext\n";

          if ($ext eq 'pdf') {
              # Walk the pages until pdftotext reports a failure, which is how
              # the end of the document is detected.
              for (my $page = 0; ; $page++) {
                  my $pageno = $page + 1;
                  my $txt    = '';

                  # List-form pipe open: no shell involved. -q replaces the
                  # former "2>/dev/null" redirection.
                  open(my $pdftext, '-|', 'pdftotext', '-q',
                       '-f', $pageno, '-l', $pageno, $source, '-')
                      or last;
                  while (my $line = <$pdftext>) {
                      chomp $line;
                      $txt .= ' ' . $line;
                  }
                  close($pdftext);
                  # end of PDF
                  last if $?;

                  print "\t\textracted text from PDF for page $page\n";
                  my ($lang, $words) = detect_lang($txt);
                  # detect_lang returns undef when no word matched the dictionary.
                  my $langname = defined $lang ? $lang : 'unknown';
                  print "\t\tdetected language is $langname\n";

                  my $pageid = get_new_page($docid);
                  print "\t\tcreated page id $pageid\n";
                  print "\t\tupdating page status to 'inprogress'\n";
                  update_page_status($pageid, 'inprogress');

                  if ( !exists $primary{$docid} ) {
                      print "\t\tsetting document for default primary thumbnail\n";
                      $primary{$docid} = undef;
                      sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
                  }

                  print "\t\tcreating original page jpeg $pageid.jpeg\n";
                  system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', 300,
                         '-jpeg', '-singlefile', $source, "$originaldir/$pageid") == 0
                      or warn "pdftoppm failed for $file page $pageno (status $?)\n";
                  # pdftoppm's output ends in .jpg here; the rest of the
                  # pipeline names the file .jpeg.
                  rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
                      or warn "Failed to rename $originaldir/$pageid.jpg: $!\n";

                  print "\t\tloading extracted words into database\n";
                  create_page_words($pageid, $lang, $words);
                  print "\t\tupdating page status to 'ok'\n";
                  update_page_status($pageid, 'ok');
                  print "\tdone\n";
              }
          }
          elsif ($ext eq 'jpeg' || $ext eq 'png') {
              print "\tdetecting image rotation\n";
              my %res;
              foreach my $rot (@rotations) {
                  print "\ttrying $rot degrees rotation\n";
                  # Unpredictable temp file, removed when $tmp goes out of scope.
                  my $tmp      = File::Temp->new(SUFFIX => '.jpeg');
                  my $tempfile = $tmp->filename;
                  system('convert', $source, '-rotate', $rot, $tempfile) == 0
                      or warn "convert failed for $file at $rot degrees (status $?)\n";

                  print "\t\trunning OCR\n";
                  my $txt = ocr_file($tempfile);
                  print "\t\tlanguage and dictionary detection\n";
                  my ($lang, $words, $dictmatches) = detect_lang($txt);
                  print "\t\tfound $dictmatches words in dictionary\n";
                  $res{$rot} = {
                      lang        => $lang,
                      words       => $words,
                      dictmatches => $dictmatches,
                  };
              }

              # Highest dictionary hit count wins; ties go to the smallest
              # rotation so the outcome does not depend on hash ordering.
              my $bestrot = $rotations[0];
              foreach my $rot (@rotations) {
                  $bestrot = $rot
                      if $res{$rot}{dictmatches} > $res{$bestrot}{dictmatches};
              }
              print "\tbest OCR results with $bestrot rotation\n";

              my $pageid = get_new_page($docid);
              print "\t\tcreated page id $pageid\n";
              print "\t\tupdating page status to 'inprogress'\n";
              update_page_status($pageid, 'inprogress');

              if ( !exists $primary{$docid} ) {
                  print "\t\tsetting document for default primary thumbnail\n";
                  $primary{$docid} = undef;
                  sqlquery($dbh, "CALL set_primary_page(?)", $pageid);
              }

              print "\t\tcreating original page jpeg $pageid.jpeg\n";
              # NOTE(review): the stored image is NOT rotated by $bestrot; only
              # the extracted words come from the best rotation. Confirm that
              # keeping the unrotated original is intended.
              system('convert', $source, "$originaldir/$pageid.jpeg") == 0
                  or warn "convert failed for $file (status $?)\n";

              print "\t\tloading extracted words into database\n";
              create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
              print "\t\tupdating page status to 'ok'\n";
              update_page_status($pageid, 'ok');
              print "\tdone\n";
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain manadatory UUIDs in its name\n";
      }

      print "\tdeleting $file\n";
      unlink($source) or warn "Failed to delete $source: $!\n";
  }
  print "done\n";
  126. # open queue
  127. # decompose PDF
  128. # normalise all files as jpegs
  129. # generate page
  130. # ocr / lang detect
  131. # update db
  # Run tesseract (eng+deu+fra+ita) on an image file and return the text it
  # writes to standard output.
  #
  #   $file - path of the image to OCR
  #
  # Returns the captured text. Returns '' (after a warning) when tesseract
  # cannot be started, so callers can keep treating OCR as best-effort.
  sub ocr_file {
      my ($file) = @_;

      # List-form pipe open: the file name is passed as a single argument and
      # is never interpreted by a shell.
      my $ocr;
      unless (open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-')) {
          warn "Failed to run tesseract on $file: $!\n";
          return '';
      }

      my $txt = '';
      while (my $line = <$ocr>) {
          $txt .= $line;
      }

      # For a pipe, close() reports the child's exit status through $?.
      close($ocr) or warn "tesseract exited abnormally on $file (status $?)\n";
      return $txt;
  }
  # Store every extracted word of a page through the add_page_word procedure.
  #
  #   $pageid - id of the page the words belong to
  #   $lang   - short language code used to look up the id in %langid; may be
  #             undef when no language was detected
  #   $words  - array reference of words (one call per element, duplicates
  #             included)
  sub create_page_words {
      my ($pageid, $lang, $words) = @_;

      # Resolve the language id once. An undefined language yields an undefined
      # id without using undef as a hash key (which would raise a warning).
      my $lang_id = defined $lang ? $langid{$lang} : undef;

      foreach my $word ( @{ $words || [] } ) {
          sqlquery($dbh, "CALL add_page_word(?,?,?)",
              $pageid, $word, $lang_id);
      }
      return;
  }
  # Feed a text to "aspell pipe" and echo aspell's output to standard output.
  #
  #   $txt  - text to check
  #   $lang - aspell language code (passed as --lang=...)
  #
  # Returns nothing. Warns and returns early when the language code looks
  # unsafe or aspell cannot be started.
  sub spell_check {
      my ($txt, $lang) = @_;

      # $lang ends up in a shell command line, so accept plain codes only.
      unless (defined $lang && $lang =~ /\A[A-Za-z][A-Za-z0-9_-]*\z/) {
          warn "spell_check: refusing to run aspell with an invalid language code\n";
          return;
      }

      my $tmp = File::Temp->new();
      print {$tmp} $txt;
      # The text must reach the disk before aspell reads the file; without the
      # flush it may still sit in Perl's output buffer.
      $tmp->flush();

      my $command = sprintf("aspell --lang=%s --ignore-case pipe < %s",
          $lang, $tmp->filename);
      my $aspell;
      unless (open($aspell, '-|', $command)) {
          warn "Failed to run aspell: $!\n";
          return;
      }
      while (my $line = <$aspell>) {
          print $line;
      }
      close($aspell);
      return;
  }
  # Split a text into words, look each word up in the dict table and pick the
  # language with the most dictionary hits.
  #
  #   $txt - text to analyse (UTF-8 encoded bytes as read from the external
  #          tools, or an already decoded string)
  #
  # Returns a three-element list:
  #   - the winning language (undef when no word was found in the dictionary),
  #   - an array reference with every lower-cased word of at least three
  #     characters, in order of appearance and including duplicates,
  #   - the number of those words that had at least one dictionary entry.
  sub detect_lang {
      my ($txt) = @_;
      my %lcnt;
      my @words;
      my $dictwords = 0;
      my %langs_for;    # word => array ref of languages, one query per distinct word

      # Work on decoded characters so that accented Latin letters are single
      # characters. The former class \x{c380}-\x{c3bf} named code points
      # U+C380..U+C3BF rather than the UTF-8 byte pairs C3 80..C3 BF
      # (U+00C0..U+00FF), so accented words were cut into pieces.
      my $text    = defined $txt ? $txt : '';
      my $decoded = utf8::decode($text);

      # Letters: ASCII plus U+00C0..U+00FF, without the multiplication and
      # division signs (U+00D7, U+00F7).
      while ($text =~ /([A-Za-z\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{FF}]+)/g) {
          my $word = $1;
          next if length $word < 3;
          $word = lc($word);
          # Hand the word on in the same byte encoding the input arrived in.
          utf8::encode($word) if $decoded;
          push @words, $word;

          my $langs = $langs_for{$word};
          if (!$langs) {
              $langs = [];
              my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
              while (my ($l) = $q->fetchrow_array()) {
                  push @{$langs}, $l;
              }
              $langs_for{$word} = $langs;
          }

          # Every occurrence counts, exactly as if it had been queried again.
          $lcnt{$_}++ for @{$langs};
          $dictwords++ if @{$langs};
      }

      #print Dumper(\%lcnt);
      # Sorted keys make the choice deterministic when several languages tie.
      my $max = 0;
      my $lmax;
      foreach my $lang ( sort keys %lcnt ) {
          if ( !defined $lmax || $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max  = $lcnt{$lang};
          }
      }
      return ($lmax, \@words, $dictwords);
  }
  # Set the status of a page by calling the update_page_status procedure.
  # Takes the page id and the new status; returns the executed statement
  # handle that sqlquery hands back.
  sub update_page_status {
      my $page   = shift;
      my $status = shift;
      return sqlquery($dbh, "CALL update_page_status(?,?)", $page, $status);
  }
  # Create a page for the given document through the create_page procedure.
  # Returns the first column of the last row the procedure produced, or undef
  # when it produced no row.
  sub get_new_page {
      my $document = shift;
      my $sth = sqlquery($dbh, "CALL create_page(?)", $document);

      my $newest;
      while (my @row = $sth->fetchrow_array()) {
          $newest = $row[0];
      }
      return $newest;
  }
  # Return a freshly created UUID string from Data::UUID, lower-cased.
  sub gen_uuid {
      my $generator = Data::UUID->new;
      my $uuid      = $generator->create_str();
      return lc $uuid;
  }
  # Read a JSON configuration file and return the decoded data structure.
  #
  #   $file - path of the configuration file
  #
  # Dies when the file cannot be opened; the message names the file and the
  # system error. Errors raised by from_json propagate unchanged.
  sub load_conf {
      my ($file) = @_;

      # Three-argument open: the path can never be read as a mode or a command.
      open(my $fh, '<', $file)
          or die "Failed to load configuration file $file: $!";
      my $json = do { local $/; <$fh> };    # slurp the whole file
      close($fh);

      return from_json(defined $json ? $json : '');
  }
  # Open a MySQL connection through DBI.
  #
  #   $sql - hash reference with the keys base (database name), host, user
  #          and pass
  #
  # Returns the database handle. Dies with DBI's error text when the
  # connection cannot be established.
  sub sqlconnect {
      my ($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";

      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass});
      if (!$dbh) {
          my $reason = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
          die "Failed to connect to database: $reason";
      }
      return $dbh;
  }
  # Prepare and execute an SQL statement with optional bind values.
  #
  #   $dbh   - database handle
  #   $query - SQL text with ? placeholders
  #   @args  - values bound to the placeholders
  #
  # Returns the executed statement handle. Dies when either step fails; the
  # message says which step failed and carries the handle's error text.
  sub sqlquery {
      my ($dbh, $query, @args) = @_;
      #print STDERR "$query\n";

      my $sth = $dbh->prepare($query)
          or die "Failed to prepare SQL query: " . _sql_error($dbh);
      $sth->execute(@args)
          or die "Failed to execute SQL query: " . _sql_error($sth);
      return $sth;
  }

  # Error text of a handle, or a placeholder when the handle has none.
  sub _sql_error {
      my ($handle) = @_;
      my $text = $handle->errstr;
      return defined $text ? $text : 'unknown error';
  }