An unfinished system to manage all your paper documentation in an easy way.
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

autodoc_process.pl 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use warnings;
  $Data::Dumper::Sortkeys = 1;

  # ----------------------------------------------------------------------
  # Main script.
  #
  # Walks the queue directory and splits every queued PDF into pages. For
  # each page the text is extracted with pdftotext and handed to
  # detect_lang(), a page row is created through get_new_page(), and a
  # 600 dpi JPEG of the page is rendered into the "original" directory with
  # pdftoppm. Non-PDF queue entries are recognised but not processed yet.
  # ----------------------------------------------------------------------
  my $conf = load_conf("../etc/autodoc.json");
  my $dbh = sqlconnect($conf->{sql});
  my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Queue entries are named "<uuid>_<uuid>.<ext>"; the first UUID is used as
  # the document id, the second one is not used by this script.
  my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

  opendir(my $queue, $queuedir)
      || die "Failed to open queue directory $queuedir: $!";
  foreach my $file ( readdir($queue) ) {
      my ($docid, $ext) = $file =~ /^($uuid_re)_$uuid_re\.([a-z]+)$/
          or next;
      print "Found document id $docid of type $ext\n";

      # Only PDFs are handled so far.
      next unless $ext eq 'pdf';

      my $pdf = "$queuedir/$file";

      # $page is 0-based for logging; the poppler tools count from 1.
      for (my $page = 0;; $page++) {
          print "texting page $page\n";

          # undef means pdftotext failed, which is how the end of the PDF
          # is detected.
          my $txt = pdf_page_text($pdf, $page + 1);
          last unless defined $txt;

          # The detection result is not stored anywhere yet.
          my %lang = detect_lang($txt);

          my $pageid = get_new_page($docid);
          print "new page id $pageid\n";

          # List-form system() bypasses the shell, so paths with spaces or
          # shell metacharacters are passed through untouched.
          my $root = "$originaldir/$pageid";
          my $status = system('pdftoppm',
              '-f', $page + 1, '-l', $page + 1,
              '-r', 600, '-jpeg', '-singlefile',
              $pdf, $root);
          if ( $status != 0 ) {
              warn "pdftoppm failed for page $page of $file (status $status)\n";
              next;
          }

          # pdftoppm appends ".jpg" to the output root on its own; move the
          # result to the ".jpeg" name announced below.
          my $rendered = "$root.jpg";
          if ( -e $rendered ) {
              rename($rendered, "$root.jpeg")
                  || warn "Failed to rename $rendered to $root.jpeg: $!\n";
          }
          print "create original page jpeg $pageid.jpeg\n";
      }
  }
  closedir($queue);

  # Extract the text of a single page of a PDF with pdftotext.
  #
  # Parameters:
  #   $pdf    - path of the PDF file
  #   $pageno - 1-based page number
  #
  # Returns the page text with every line prefixed by a single space and
  # joined into one string, or undef (empty list) when pdftotext could not
  # be started or exited with a non-zero status.
  sub pdf_page_text {
      my ($pdf, $pageno) = @_;

      # Silence pdftotext's stderr for the child process only: STDERR is
      # pointed at /dev/null while the child is spawned and restored right
      # afterwards.
      open(my $saved_err, '>&', \*STDERR) || die "Failed to save STDERR: $!";
      open(STDERR, '>', '/dev/null') || die "Failed to silence STDERR: $!";
      my $pid = open(my $txtfh, '-|',
          'pdftotext', '-f', $pageno, '-l', $pageno, $pdf, '-');
      open(STDERR, '>&', $saved_err) || die "Failed to restore STDERR: $!";
      close($saved_err);
      return unless $pid;

      my $txt = '';
      while ( my $line = <$txtfh> ) {
          chomp $line;
          $txt .= ' ' . $line;
      }

      # close() on a pipe waits for the child and sets $? to its status.
      close($txtfh);
      return if $?;
      return $txt;
  }
  46. # open queue
  47. # decompose PDF
  48. # normalise all files as jpegs
  49. # generate page
  50. # ocr / lang detect
  51. # update db
  # Count, per language, how many words of a text are found in the "dict"
  # table.
  #
  # Parameters:
  #   $txt - text to analyse; it is split on spaces, quotes, dots and dashes
  #
  # Returns a hash (as a key/value list) mapping language => number of words
  # attributed to that language. When a word is listed for several languages
  # only the last row returned by the database is counted.
  #
  # Uses the file-level $dbh handle and prints debugging output to STDOUT.
  sub detect_lang {
      my($txt) = @_;
      my %lcnt;
      $txt = '' unless defined $txt;
      foreach my $token ( split(/[ '".-]/,$txt) ) {
          # Consecutive separators produce empty tokens; looking those up
          # would only cost a database round trip each.
          next if $token eq '';
          my $word = lc($token);
          my $lang;
          print "$word ";
          # NOTE(review): LIKE treats % and _ inside $word as wildcards;
          # confirm whether an exact match (=) is what is wanted here.
          my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
          while(my ($row_lang) = $q->fetchrow_array()) {
              print "$row_lang ";
              $lang = $row_lang;
          }
          print "\n";
          $lcnt{$lang}++ if defined $lang;
      }
      print Dumper(\%lcnt);
      # Hand the counts back explicitly; without this the sub would return
      # the result of the print above.
      return %lcnt;
  }
  # Create a new page record and attach it to a document.
  #
  # Parameters:
  #   $docid - id of the document the page belongs to
  #
  # Inserts a row into "pages" (status 'inprogress', created = NOW()) and a
  # row into "documents_pages" linking the document to the page, both through
  # the file-level $dbh handle.
  #
  # Returns the id generated for the new page.
  sub get_new_page {
      my ($docid) = @_;

      # The statements are assembled line by line; the leading empty element
      # reproduces the newline the query text starts with.
      my $insert_page = join("\n",
          '',
          'INSERT INTO',
          'pages',
          'SET',
          'id = ?,',
          'created = NOW(),',
          "status = 'inprogress'");
      my $insert_link = join("\n",
          '',
          'INSERT INTO',
          'documents_pages',
          'SET',
          'documentId = ?,',
          'pageId = ?');

      my $new_id = gen_uuid();
      sqlquery($dbh, $insert_page, $new_id);
      sqlquery($dbh, $insert_link, $docid, $new_id);
      return $new_id;
  }
  # Generate a new UUID with Data::UUID.
  #
  # Returns the UUID's string form converted to lower case.
  sub gen_uuid {
      my $uuid_str = Data::UUID->new->create_str();
      return lc $uuid_str;
  }
  # Read a JSON configuration file and decode it.
  #
  # Parameters:
  #   $file - path of the configuration file
  #
  # Returns whatever from_json() produces for the file's content.
  # Dies when the file cannot be opened; the message names the file and the
  # system error.
  sub load_conf {
      my($file) = @_;
      # Three-argument open with a lexical handle: the file name can never
      # be mistaken for an open mode, and no global handle is left behind.
      open(my $fh, '<', $file)
          || die "Failed to load configuration file $file: $!";
      # Slurp the whole file in one read.
      my $json = do { local $/; <$fh> };
      close($fh);
      $json = '' unless defined $json;
      return from_json($json);
  }
  # Open a MySQL connection through DBI.
  #
  # Parameters:
  #   $sql - hash reference with the keys base (database name), host, user
  #          and pass
  #
  # Returns the database handle.
  # Dies when the connection cannot be established; the message carries the
  # DBI error text when there is one.
  sub sqlconnect {
      my($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
          || die "Failed to connect to database: "
               . ($DBI::errstr || 'unknown error');
      return $dbh;
  }
  # Prepare and execute an SQL statement.
  #
  # Parameters:
  #   $dbh   - database handle
  #   $query - SQL text, optionally with ? placeholders
  #   @args  - values bound to the placeholders
  #
  # Returns the executed statement handle.
  # Dies when preparing or executing fails; the message says which step
  # failed and carries the handle's error text when there is one.
  sub sqlquery {
      my ($dbh, $query, @args) = @_;
      #print STDERR "$query\n";
      my $sth = $dbh->prepare($query)
          || die "Failed to prepare SQL query: "
               . ($dbh->errstr || 'unknown error');
      $sth->execute(@args)
          || die "Failed to execute SQL query: "
               . ($sth->errstr || 'unknown error');
      return $sth;
  }
  117. }