An unfinished system to manage all your paper documentation in an easy way.
Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

autodoc_process.pl 6.6KB

pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
pirms 5 gadiem
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use File::Temp;
  9. use warnings;
  # Main script: walks the queue directory and turns every queued file into
  # one or more pages in the database.
  #
  # Queue files must be named <uuid>_<uuid>.<ext>; the first UUID is used as
  # the document id.  PDFs are split page by page (text via pdftotext, image
  # via pdftoppm).  JPEG/PNG files are OCRed at four rotations and the words
  # of the rotation with the most dictionary matches are stored.

  # Sort hash keys so that any Data::Dumper debugging output is stable.
  $Data::Dumper::Sortkeys = 1;

  print "Loading configuration\n";
  my $conf = load_conf("../etc/autodoc.json");

  print "Connecting to database\n";
  my $dbh = sqlconnect($conf->{sql});

  # Files are read from $queuedir; one JPEG per page is written to $originaldir.
  my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Documents for which a primary page has already been set during this run.
  my %primary;

  print "Loading languages\n";
  my %langid;
  my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }

  print "Opening queue folder $queuedir\n";
  # Fail loudly instead of silently processing nothing when the folder is missing.
  opendir(my $queue_dh, $queuedir)
      || die "Failed to open queue folder $queuedir: $!";

  foreach my $file ( readdir($queue_dh) ) {
      next if $file =~ /^\./;
      print "processing file $file\n";

      if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
          my $docid = $1;
          my $ext = $3;
          my $source = "$queuedir/$file";
          print "\tdocument id $docid of type $ext\n";

          if ( $ext eq 'pdf' ) {
              # Pages are probed one at a time until pdftotext reports failure,
              # which is taken to mean the end of the PDF.
              for(my $page=0;; $page++) {
                  my $txt = '';
                  # The page number and path are passed as positional shell
                  # parameters so they are never parsed as shell syntax; the
                  # shell is only needed for the stderr redirection.
                  open(my $txt_fh, '-|', 'sh', '-c',
                      'exec pdftotext -f "$1" -l "$1" "$2" - 2>/dev/null',
                      'sh', $page+1, $source) || last;
                  while(my $line = <$txt_fh>) {
                      chomp $line;
                      $txt .= ' ' . $line;
                  }
                  close($txt_fh);
                  # end of PDF
                  last if $?;

                  print "\t\textracted text from PDF for page $page\n";
                  my ($lang,$words) = detect_lang($txt);
                  print "\t\tdetected language is $lang\n";

                  my $pageid = begin_page($docid);

                  print "\t\tcreating original page jpeg $pageid.jpeg\n";
                  run_command('pdftoppm', '-f', $page+1, '-l', $page+1,
                      '-r', 300, '-jpeg', '-singlefile',
                      $source, "$originaldir/$pageid");
                  # pdftoppm writes <prefix>.jpg; the rest of the system expects .jpeg.
                  rename("$originaldir/$pageid.jpg", "$originaldir/$pageid.jpeg")
                      || warn "Failed to rename $originaldir/$pageid.jpg: $!\n";

                  print "\t\tloading extracted words into database\n";
                  create_page_words($pageid, $lang, $words);
                  print "\t\tupdating page status to 'ok'\n";
                  update_page_status($pageid, 'ok');
                  print "\tdone\n";
              }
          }
          elsif ( $ext =~ /^(jpeg|png)$/ ) {
              print "\tdetecting image rotation\n";
              my %res;
              for(my $rot=0; $rot<360; $rot+=90) {
                  print "\ttrying $rot degrees rotation\n";
                  # Unpredictable temp file name; removed automatically when
                  # the object goes out of scope at the end of this iteration.
                  my $tempfile = File::Temp->new(SUFFIX => '.jpeg');
                  run_command('convert', $source, '-rotate', $rot,
                      $tempfile->filename);
                  print "\t\trunning OCR\n";
                  my $txt = ocr_file($tempfile->filename);
                  print "\t\tlanguage and dictionary detection\n";
                  my($lang,$words, $dictmatches) = detect_lang($txt);
                  print "\t\tfound $dictmatches words in dictionary\n";
                  $res{$rot} = {
                      lang => $lang,
                      words => $words,
                      dictmatches => $dictmatches
                  };
              }

              # Walk the rotations in ascending order so that ties are always
              # resolved in favour of the smallest rotation (0 degrees first).
              my $maxwords = 0;
              my $bestrot;
              foreach my $rot ( sort { $a <=> $b } keys %res ) {
                  $bestrot=$rot if !defined $bestrot;
                  if ( $maxwords < $res{$rot}{dictmatches} ) {
                      $maxwords = $res{$rot}{dictmatches};
                      $bestrot = $rot;
                  }
              }
              print "\tbest OCR results with $bestrot rotation\n";

              my $pageid = begin_page($docid);

              # The stored image is converted from the queue file as-is (unrotated).
              print "\t\tcreating original page jpeg $pageid.jpeg\n";
              run_command('convert', $source, "$originaldir/$pageid.jpeg");

              print "\t\tloading extracted words into database\n";
              create_page_words($pageid, $res{$bestrot}{lang}, $res{$bestrot}{words});
              print "\t\tupdating page status to 'ok'\n";
              update_page_status($pageid, 'ok');
              print "\tdone\n";
          }
          else {
              print "\terror: don't know how to process files of $ext type\n";
          }
      }
      else {
          print "\terror: file doesn't contain manadatory UUIDs in its name\n";
      }
  }
  closedir($queue_dh);
  print "done\n";

  # Create a new page for $docid, mark it 'inprogress' and, for the first page
  # of each document seen in this run, register it as the primary page.
  # Returns the new page id.
  sub begin_page {
      my($docid) = @_;
      my $pageid = get_new_page($docid);
      print "\t\tcreated page id $pageid\n";
      print "\t\tupdating page status to 'inprogress'\n";
      update_page_status($pageid, 'inprogress');
      if ( !exists $primary{$docid} ) {
          print "\t\tsetting document for default primary thumbnail\n";
          $primary{$docid}=undef;
          sqlquery($dbh, "CALL set_primary_page(?)",$pageid);
      }
      return $pageid;
  }

  # Run an external program with its arguments passed as a list, bypassing the
  # shell.  A failure is reported on STDERR but does not abort the run.
  # Returns true when the command exited with status 0.
  sub run_command {
      my @cmd = @_;
      my $rc = system { $cmd[0] } @cmd;
      warn "Command '@cmd' failed (status $?)\n" if $rc != 0;
      return $rc == 0;
  }
  124. # open queue
  125. # decompose PDF
  126. # normalise all files as jpegs
  127. # generate page
  128. # ocr / lang detect
  129. # update db
  # Run tesseract OCR (English, German, French and Italian models) on an image
  # file and return everything it prints on stdout as one string.
  #
  #   $file - path of the image to recognise
  #
  # Returns '' (after a warning) when tesseract cannot be started.
  sub ocr_file {
      my($file) = @_;
      my $ocr;
      # List-form pipe open: the file name is handed to tesseract directly and
      # is never interpreted by a shell.
      if ( !open($ocr, '-|', 'tesseract', '-l', 'eng+deu+fra+ita', $file, '-') ) {
          warn "Failed to run tesseract on $file: $!\n";
          return '';
      }
      my $txt = join('', <$ocr>);
      close($ocr);
      return $txt;
  }
  # Record each word of a page by calling the add_page_word stored procedure
  # once per word.
  #
  #   $pageid - id of the page the words belong to
  #   $lang   - short language code, translated to its id through %langid
  #   $words  - array reference holding the words
  sub create_page_words {
      my($pageid, $lang, $words) = @_;
      for my $entry ( @$words ) {
          sqlquery(
              $dbh,
              "CALL add_page_word(?,?,?)",
              $pageid,
              $entry,
              $langid{$lang},
          );
      }
  }
  # Pipe a text through "aspell pipe" for the given language and copy aspell's
  # output to STDOUT.
  #
  #   $txt  - text to check
  #   $lang - language code passed to aspell's --lang option
  #
  # Returns the result of closing the aspell pipe, or nothing (after a
  # warning) when aspell cannot be started.
  sub spell_check {
      my($txt, $lang) = @_;
      my $tmp = File::Temp->new();
      print $tmp $txt;
      # Make sure the text is on disk before aspell reads the file, rather
      # than relying on Perl's implicit flush-before-fork.
      $tmp->flush();
      my $spell;
      # The shell is only used for the stdin redirection; the language and
      # the path are positional parameters, so they are never parsed as
      # shell syntax.
      if ( !open($spell, '-|', 'sh', '-c',
              'exec aspell --lang="$1" --ignore-case pipe < "$2"',
              'sh', $lang, $tmp->filename) ) {
          warn "Failed to run aspell: $!\n";
          return;
      }
      while(my $line = <$spell>) {
          print $line;
      }
      return close($spell);
  }
  # Split a text into words and guess its language from the dictionary table.
  #
  #   $txt - text to analyse; assumed to be a byte string holding UTF-8
  #          (which is what the pipes in this file deliver) - TODO confirm
  #          if callers ever pass decoded character strings
  #
  # Returns a three element list:
  #   - the language with the most dictionary hits (undef when no word matched)
  #   - array reference of all lower-cased words of at least three bytes
  #   - number of words found in the dictionary at least once
  sub detect_lang {
      my($txt) = @_;
      $txt = '' if !defined $txt;
      my %lcnt;
      my @words;
      my $dictwords=0;
      # A word is a run of ASCII letters and/or two-byte UTF-8 sequences
      # C3 80 .. C3 BF (U+00C0..U+00FF, the accented Latin letters).  The former
      # class \x{c380}-\x{c3bf} named code points U+C380..U+C3BF instead, so
      # accented letters were treated as separators and words were cut apart.
      while ( $txt =~ /((?:[a-zA-Z]|\xc3[\x80-\xbf])+)/g ) {
          my $word = $1;
          next if length $word < 3;
          $word = lc($word);
          push @words, $word;
          my $found=0;
          # A word may exist in several languages; every hit counts for its language.
          my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
          while(my ($l) = $q->fetchrow_array()) {
              $lcnt{$l}++;
              $found=1;
          }
          $dictwords++ if $found;
      }
      # Pick the language with the most hits.  Keys are visited in sorted order
      # so that a tie always yields the same answer.
      my $max = 0;
      my $lmax;
      foreach my $lang ( sort keys %lcnt ) {
          if ( $lcnt{$lang} > $max ) {
              $lmax = $lang;
              $max = $lcnt{$lang};
          }
      }
      return ($lmax, \@words, $dictwords);
  }
  # Set the status of a page through the update_page_status stored procedure.
  #
  #   $pageid - id of the page to update
  #   $status - new status string (this file uses 'inprogress' and 'ok')
  #
  # Returns the statement handle produced by sqlquery.
  sub update_page_status {
      my $pageid = shift;
      my $status = shift;
      return sqlquery(
          $dbh,
          "CALL update_page_status(?,?)",
          $pageid,
          $status,
      );
  }
  # Create a new page for a document through the create_page stored procedure.
  #
  #   $docid - id of the document the page belongs to
  #
  # Returns the first column of the last row the procedure returned, or undef
  # when it returned no rows.
  sub get_new_page {
      my $docid = shift;
      my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);
      my $pageid;
      while ( my @row = $sth->fetchrow_array() ) {
          $pageid = $row[0];
      }
      return $pageid;
  }
  # Generate a fresh UUID with Data::UUID and return its string form in
  # lower case.
  sub gen_uuid {
      my $uuid = Data::UUID->new->create_str();
      return lc $uuid;
  }
  # Read a JSON configuration file and return the decoded data structure.
  #
  #   $file - path of the JSON file
  #
  # Dies when the file cannot be opened; the message names the file and the
  # system error.  A malformed file makes from_json die.
  sub load_conf {
      my($file) = @_;
      # Three-argument open: the path is never interpreted as an open mode
      # or as a command.
      open(my $fh, '<', $file)
          || die "Failed to load configuration file $file: $!";
      my $json = join('', <$fh>);
      close($fh);
      return from_json($json);
  }
  # Open a MySQL connection through DBI.
  #
  #   $sql - hash reference with the keys base (database name), host, user
  #          and pass
  #
  # Returns the database handle; dies with the driver's error text when the
  # connection cannot be established.
  sub sqlconnect {
      my($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
          || die "Failed to connect to database: $DBI::errstr";
      return $dbh;
  }
  # Prepare and execute an SQL statement with bound parameters.
  #
  #   $dbh   - database handle
  #   $query - SQL text with ? placeholders
  #   @args  - values bound to the placeholders, in order
  #
  # Returns the executed statement handle.  Dies when either step fails; the
  # message says which step failed and carries the handle's error text.
  sub sqlquery {
      my $dbh = shift;
      my $query = shift;
      my @args = @_;
      #print STDERR "$query\n";
      my $sth = $dbh->prepare($query)
          || die "Failed to prepare SQL query: " . $dbh->errstr;
      $sth->execute(@args)
          || die "Failed to execute SQL query: " . $sth->errstr;
      return $sth;
  }
  227. }