An unfinished system to manage all your paper documentation in an easy way.
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

autodoc_process.pl 3.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use File::Temp;
  9. use warnings;
  10. $Data::Dumper::Sortkeys = 1;
  11. my $conf = load_conf("../etc/autodoc.json");
  12. my $dbh = sqlconnect($conf->{sql});
  13. my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  14. my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};
  15. my %langid;
  16. my $q = sqlquery($dbh, "SELECT id,short FROM lang");
  17. while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }
  18. opendir(Q,$queuedir);
  19. foreach my $file ( readdir(Q) ) {
  20. if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
  21. my $docid = $1;
  22. my $ext = $3;
  23. print "Found document id $docid of type $ext\n";
  24. if ( $ext eq 'pdf' ) {
  25. my @pages;
  26. for(my $page=0;; $page++) {
  27. my $txt = '';
  28. print "texting page $page\n";
  29. open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
  30. while(<TXT>) {
  31. chomp;
  32. $txt .= ' ' . $_;
  33. }
  34. close(TXT);
  35. # end of PDF
  36. last if $?;
  37. my ($lang,$words) = detect_lang($txt);
  38. print "language is $lang\n";
  39. #spell_check($txt,$lang);
  40. my $pageid = get_new_page($docid);
  41. print "new page id $pageid\n";
  42. update_page_status($pageid, 'inprogress');
  43. print "create original page jpeg $pageid.jpeg";
  44. system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
  45. system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
  46. $originaldir, $pageid,
  47. $originaldir, $pageid ));
  48. create_page_words($pageid, $lang, $words);
  49. update_page_status($pageid, 'ok');
  50. }
  51. }
  52. else {
  53. }
  54. }
  55. }
  56. closedir(Q);
  57. # open queue
  58. # decompose PDF
  59. # normalise all files as jpegs
  60. # generate page
  61. # ocr / lang detect
  62. # update db
  63. sub create_page_words {
  64. my($pageid, $lang, $words) = @_;
  65. foreach my $word ( @{$words} ) {
  66. sqlquery($dbh, "CALL add_page_word(?,?,?)",
  67. $pageid, $word, $langid{$lang});
  68. }
  69. }
  70. sub spell_check {
  71. my($txt, $lang) = @_;
  72. my $tmp = File::Temp->new();
  73. print $tmp $txt;
  74. open(F, sprintf("cat %s | aspell --lang=%s --ignore-case pipe |",$tmp->filename, $lang));
  75. while(<F>) {
  76. print $_;
  77. }
  78. close(F);
  79. }
  80. sub detect_lang {
  81. my($txt) = @_;
  82. my %lcnt;
  83. my @words;
  84. foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
  85. next if length $word < 3;
  86. $word = lc($word);
  87. push @words, $word;
  88. my $lang;
  89. my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
  90. while(my ($l) = $q->fetchrow_array()) {
  91. $lcnt{$l}++;
  92. }
  93. }
  94. print Dumper(\%lcnt);
  95. my $max = 0;
  96. my $lmax;
  97. foreach my $lang ( keys %lcnt ) {
  98. $lmax = $lang if !defined $lmax;
  99. if ( $lcnt{$lang} > $max ) {
  100. $lmax = $lang;
  101. $max = $lcnt{$lang};
  102. }
  103. }
  104. return ($lmax, \@words);
  105. }
  106. sub update_page_status {
  107. my($pageid, $status) = @_;
  108. sqlquery($dbh, "CALL update_page_status(?,?)",$pageid, $status);
  109. }
  110. sub get_new_page {
  111. my($docid) = @_;
  112. my $pageid;
  113. my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
  114. while(my($id)=$q->fetchrow_array()) {
  115. $pageid = $id;
  116. }
  117. return $pageid;
  118. }
  119. sub gen_uuid {
  120. my $ug = Data::UUID->new;
  121. return lc($ug->create_str());
  122. }
  123. sub load_conf {
  124. my($file) = @_;
  125. my $x='';
  126. open(F,"$file") || die "Failed to load configuration file";
  127. while(<F>) { $x.=$_; }
  128. close(F);
  129. return from_json($x);
  130. }
  131. sub sqlconnect {
  132. my($sql) = @_;
  133. my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
  134. my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}) || \\
  135. die "Failed to connect to database";
  136. return $dbh;
  137. }
  138. sub sqlquery {
  139. my $dbh = shift;
  140. my $query = shift;
  141. my @args = @_;
  142. #print STDERR "$query\n";
  143. my $sth = $dbh->prepare($query) || die "Failed to execute SQL query";
  144. $sth->execute(@args) || die "Failed to execute SQL query";
  145. return $sth;
  146. }