An unfinished system to manage all your paper documentation in an easy way.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

autodoc_process.pl 3.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use File::Temp;
  9. use warnings;
  10. $Data::Dumper::Sortkeys = 1;
  11. my $conf = load_conf("../etc/autodoc.json");
  12. my $dbh = sqlconnect($conf->{sql});
  13. my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  14. my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};
  15. my %langid;
  16. my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  17. while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }
  18. opendir(Q,$queuedir);
  19. foreach my $file ( readdir(Q) ) {
  20. if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
  21. my $docid = $1;
  22. my $ext = $3;
  23. print "Found document id $docid of type $ext\n";
  24. if ( $ext eq 'pdf' ) {
  25. my @pages;
  26. for(my $page=0;; $page++) {
  27. my $txt = '';
  28. print "texting page $page\n";
  29. open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
  30. while(<TXT>) {
  31. chomp;
  32. $txt .= ' ' . $_;
  33. }
  34. close(TXT);
  35. # end of PDF
  36. last if $?;
  37. my ($lang,$words) = detect_lang($txt);
  38. print "language is $lang\n";
  39. #spell_check($txt,$lang);
  40. my $pageid = get_new_page($docid);
  41. print "new page id $pageid\n";
  42. update_page_status($pageid, 'inprogress');
  43. print "create original page jpeg $pageid.jpeg";
  44. system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
  45. system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
  46. $originaldir, $pageid,
  47. $originaldir, $pageid ));
  48. create_page_words($pageid, $lang, $words);
  49. update_page_status($pageid, 'ok');
  50. }
  51. }
  52. else {
  53. }
  54. }
  55. }
  56. closedir(Q);
  57. # open queue
  58. # decompose PDF
  59. # normalise all files as jpegs
  60. # generate page
  61. # ocr / lang detect
  62. # update db
  63. sub create_page_words {
  64. my($pageid, $lang, $words) = @_;
  65. foreach my $word ( @{$words} ) {
  66. sqlquery($dbh, "CALL add_page_word(?,?,?)",
  67. $pageid, $word, $langid{$lang});
  68. }
  69. }
  70. sub spell_check {
  71. my($txt, $lang) = @_;
  72. my $tmp = File::Temp->new();
  73. print $tmp $txt;
  74. open(F, sprintf("cat %s | aspell --lang=%s --ignore-case pipe |",$tmp->filename, $lang));
  75. while(<F>) {
  76. print $_;
  77. }
  78. close(F);
  79. }
  80. sub detect_lang {
  81. my($txt) = @_;
  82. my %lcnt;
  83. my @words;
  84. foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
  85. next if length $word < 3;
  86. $word = lc($word);
  87. push @words, $word;
  88. my $lang;
  89. my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
  90. while(my ($l) = $q->fetchrow_array()) {
  91. $lcnt{$l}++;
  92. }
  93. }
  94. print Dumper(\%lcnt);
  95. my $max = 0;
  96. my $lmax;
  97. foreach my $lang ( keys %lcnt ) {
  98. $lmax = $lang if !defined $lmax;
  99. if ( $lcnt{$lang} > $max ) {
  100. $lmax = $lang;
  101. $max = $lcnt{$lang};
  102. }
  103. }
  104. return ($lmax, \@words);
  105. }
  106. sub update_page_status {
  107. my($pageid, $status) = @_;
  108. sqlquery($dbh, "CALL update_page_status(?,?)",$pageid, $status);
  109. }
  110. sub get_new_page {
  111. my($docid) = @_;
  112. my $pageid;
  113. my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
  114. while(my($id)=$q->fetchrow_array()) {
  115. $pageid = $id;
  116. }
  117. return $pageid;
  118. }
  119. sub gen_uuid {
  120. my $ug = Data::UUID->new;
  121. return lc($ug->create_str());
  122. }
  123. sub load_conf {
  124. my($file) = @_;
  125. my $x='';
  126. open(F,"$file") || die "Failed to load configuration file";
  127. while(<F>) { $x.=$_; }
  128. close(F);
  129. return from_json($x);
  130. }
  131. sub sqlconnect {
  132. my($sql) = @_;
  133. my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
  134. my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}) || \\
  135. die "Failed to connect to database";
  136. return $dbh;
  137. }
  138. sub sqlquery {
  139. my $dbh = shift;
  140. my $query = shift;
  141. my @args = @_;
  142. #print STDERR "$query\n";
  143. my $sth = $dbh->prepare($query) || die "Failed to execute SQL query";
  144. $sth->execute(@args) || die "Failed to execute SQL query";
  145. return $sth;
  146. }