An unfinished system to manage all your paper documentation in an easy way.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

autodoc_process.pl 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use warnings;
  9. $Data::Dumper::Sortkeys = 1;
  10. my $conf = load_conf("../etc/autodoc.json");
  11. my $dbh = sqlconnect($conf->{sql});
  12. my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  13. my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};
  14. opendir(Q,$queuedir);
  15. foreach my $file ( readdir(Q) ) {
  16. if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
  17. my $docid = $1;
  18. my $ext = $3;
  19. print "Found document id $docid of type $ext\n";
  20. if ( $ext eq 'pdf' ) {
  21. my @pages;
  22. for(my $page=0;; $page++) {
  23. my $txt = '';
  24. print "texting page $page\n";
  25. open(TXT,sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null |", $page+1, $page+1, $queuedir, $file)) || last;
  26. while(<TXT>) {
  27. chomp;
  28. $txt .= ' ' . $_;
  29. }
  30. close(TXT);
  31. # end of PDF
  32. last if $?;
  33. my %lang = detect_lang($txt);
  34. my $pageid = get_new_page($docid);
  35. print "new page id $pageid\n";
  36. system(sprintf("pdftoppm -f %s -l %s -r 600 -jpeg -singlefile %s/%s %s/%s.jpeg", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
  37. #system(sprintf("mv %s/%s.jpeg.jpg %s/%s.jpeg",
  38. print "create original page jpeg $pageid.jpeg";
  39. }
  40. }
  41. else {
  42. }
  43. }
  44. }
  45. closedir(Q);
  46. # open queue
  47. # decompose PDF
  48. # normalise all files as jpegs
  49. # generate page
  50. # ocr / lang detect
  51. # update db
  52. sub detect_lang {
  53. my($txt) = @_;
  54. my @larr;
  55. my %lcnt;
  56. foreach my $word ( split(/ /,$txt) ) {
  57. $word = lc($word);
  58. my $lang;
  59. print "$word ";
  60. my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
  61. while(my ($l) = $q->fetchrow_array()) {
  62. print "$lang ";
  63. $lang = $l;
  64. }
  65. print "\n";
  66. push @larr, $lang if defined $lang;
  67. $lcnt{$lang}++ if defined $lang;
  68. }
  69. print Dumper(\%lcnt);
  70. }
  71. sub get_new_page {
  72. my($docid) = @_;
  73. my $pageid = gen_uuid();
  74. sqlquery($dbh, "
  75. INSERT INTO
  76. pages
  77. SET
  78. id = ?,
  79. created = NOW(),
  80. status = 'inprogress'",
  81. $pageid);
  82. sqlquery($dbh, "
  83. INSERT INTO
  84. documents_pages
  85. SET
  86. documentId = ?,
  87. pageId = ?", $docid, $pageid);
  88. return $pageid;
  89. }
  90. sub gen_uuid {
  91. my $ug = Data::UUID->new;
  92. return lc($ug->create_str());
  93. }
  94. sub load_conf {
  95. my($file) = @_;
  96. my $x='';
  97. open(F,"$file") || die "Failed to load configuration file";
  98. while(<F>) { $x.=$_; }
  99. close(F);
  100. return from_json($x);
  101. }
  102. sub sqlconnect {
  103. my($sql) = @_;
  104. my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
  105. my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}) || \\
  106. die "Failed to connect to database";
  107. return $dbh;
  108. }
  109. sub sqlquery {
  110. my $dbh = shift;
  111. my $query = shift;
  112. my @args = @_;
  113. #print STDERR "$query\n";
  114. my $sth = $dbh->prepare($query) || die "Failed to execute SQL query";
  115. $sth->execute(@args) || die "Failed to execute SQL query";
  116. return $sth;
  117. }