An unfinished system to manage all your paper documentation in an easy way.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

autodoc_process.pl 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use warnings;
  # Main queue processor.
  #
  # Loads the configuration, connects to the database, then walks the queue
  # directory. Every queued PDF is split page by page: the page text is
  # extracted with pdftotext and handed to detect_lang(), a page row is
  # created with get_new_page(), and the page is rendered to a 600 dpi JPEG
  # in the "original" directory with pdftoppm.

  # Sort hash keys in Data::Dumper output so debug dumps are stable.
  $Data::Dumper::Sortkeys = 1;

  my $conf = load_conf("../etc/autodoc.json");
  # $dbh is a file-level lexical: detect_lang() and get_new_page() use it.
  my $dbh = sqlconnect($conf->{sql});
  my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Queue entries are named "<uuid>_<uuid>.<ext>"; the first UUID is used as
  # the document id, the second one is currently ignored.
  my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

  opendir(my $queue_dh, $queuedir)
      or die "Failed to open queue directory $queuedir: $!";

  foreach my $file ( readdir($queue_dh) ) {
      next unless $file =~ /^($uuid_re)_($uuid_re)\.([a-z]+)$/;
      my ($docid, $ext) = ($1, $3);
      print "Found document id $docid of type $ext\n";

      # Only PDFs are handled so far; other file types are skipped.
      next unless $ext eq 'pdf';

      my $pdf = "$queuedir/$file";

      # The page count is not known up front: keep asking for the next page
      # until pdftotext reports a failure, which marks the end of the PDF.
      for (my $page = 0; ; $page++) {
          my $pageno = $page + 1;    # the poppler tools count pages from 1
          print "texting page $page\n";

          # List-form pipe open: no shell is involved, so nothing in the
          # path can be interpreted as shell syntax. -q replaces the former
          # "2>/dev/null" redirection.
          open(my $txt_fh, '-|', 'pdftotext', '-q',
               '-f', $pageno, '-l', $pageno, $pdf, '-')
              or do {
                  warn "Failed to run pdftotext on $pdf: $!\n";
                  last;
              };
          my $txt = '';
          while (my $line = <$txt_fh>) {
              chomp $line;
              $txt .= ' ' . $line;
          }
          close($txt_fh);    # closing a pipe stores the exit status in $?

          # end of PDF
          last if $?;

          # The per-language counts are only dumped by detect_lang() for
          # now; nothing is stored yet.
          detect_lang($txt);

          my $pageid = get_new_page($docid);
          print "new page id $pageid\n";

          # pdftoppm appends ".jpg" to the output root by itself, so render
          # to "<pageid>.jpg" and rename to the ".jpeg" name used here.
          my $jpeg_root = "$originaldir/$pageid";
          my $status = system('pdftoppm', '-f', $pageno, '-l', $pageno,
                              '-r', 600, '-jpeg', '-singlefile',
                              $pdf, $jpeg_root);
          if ($status != 0) {
              warn "pdftoppm failed for page $page of $file (status $status)\n";
              next;
          }
          rename("$jpeg_root.jpg", "$jpeg_root.jpeg")
              or warn "Failed to rename $jpeg_root.jpg to $jpeg_root.jpeg: $!\n";
          print "create original page jpeg $pageid.jpeg\n";
      }
  }
  closedir($queue_dh);
  46. # open queue
  47. # decompose PDF
  48. # normalise all files as jpegs
  49. # generate page
  50. # ocr / lang detect
  51. # update db
  # detect_lang($txt)
  #
  # Counts, per language, how many words of $txt are found in the `dict`
  # table. Each whitespace-separated word is lower-cased and looked up; when
  # several rows match, the language of the last fetched row is the one
  # counted.
  #
  # Parameters:
  #   $txt - text to analyse (undef is treated as empty text)
  #
  # Returns: a list of (language => word count) pairs, suitable for
  # assignment to a hash. The counts are also dumped to STDOUT.
  sub detect_lang {
      my ($txt) = @_;
      my %lcnt;

      # split ' ' skips leading whitespace and collapses runs of it, so no
      # empty "words" are sent to the database.
      foreach my $word ( split(' ', defined $txt ? $txt : '') ) {
          my $pattern = lc($word);

          # The lookup uses LIKE, so "%" and "_" inside a word would act as
          # wildcards (a lone "%" matches the whole dictionary). Escape them
          # so words are matched literally.
          # NOTE(review): assumes the default "\" LIKE escape character,
          # i.e. NO_BACKSLASH_ESCAPES is not enabled - confirm.
          $pattern =~ s/([\\%_])/\\$1/g;

          my $lang;
          my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $pattern);
          while (my ($row_lang) = $q->fetchrow_array()) {
              $lang = $row_lang;
          }
          $lcnt{$lang}++ if defined $lang;
      }

      print Dumper(\%lcnt);
      return %lcnt;
  }
  # get_new_page($docid)
  #
  # Creates a new page record and attaches it to a document.
  # A fresh id is obtained from gen_uuid(); one row is inserted into `pages`
  # (created = NOW(), status = 'inprogress') and one into `documents_pages`
  # linking the page to $docid.
  #
  # Parameters:
  #   $docid - id of the document the page belongs to
  #
  # Returns: the id of the newly created page.
  sub get_new_page {
      my ($docid) = @_;
      my $new_id = gen_uuid();

      # Statements are assembled line by line; the leading empty element
      # yields the newline the SQL text starts with.
      my $insert_page = join("\n",
          '',
          'INSERT INTO',
          'pages',
          'SET',
          'id = ?,',
          'created = NOW(),',
          "status = 'inprogress'",
      );
      sqlquery($dbh, $insert_page, $new_id);

      my $link_page = join("\n",
          '',
          'INSERT INTO',
          'documents_pages',
          'SET',
          'documentId = ?,',
          'pageId = ?',
      );
      sqlquery($dbh, $link_page, $docid, $new_id);

      return $new_id;
  }
  # gen_uuid()
  #
  # Returns the string form of a newly created Data::UUID identifier,
  # converted to lower case.
  sub gen_uuid {
      return lc( Data::UUID->new()->create_str() );
  }
  # load_conf($file)
  #
  # Reads the JSON configuration file $file and returns the decoded data
  # structure.
  #
  # Parameters:
  #   $file - path of the JSON configuration file
  #
  # Returns: whatever from_json() produces for the file content.
  # Dies: when the file cannot be opened (the message names the file and the
  # system error), or when from_json() rejects the content.
  sub load_conf {
      my ($file) = @_;

      # Three-argument open with a lexical handle: the file name can never
      # be interpreted as an open mode, and the handle is not a global.
      open(my $fh, '<', $file)
          or die "Failed to load configuration file $file: $!";
      my $json = do { local $/; <$fh> };    # slurp the whole file
      close($fh);

      return from_json($json);
  }
  # sqlconnect($sql)
  #
  # Opens a MySQL connection through DBI.
  #
  # Parameters:
  #   $sql - hash reference with the keys base (database name), host, user
  #          and pass
  #
  # Returns: the database handle returned by DBI->connect.
  # Dies: when the connection cannot be established; the message includes
  # the DBI error text.
  sub sqlconnect {
      my ($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
          or die "Failed to connect to database: "
               . ($DBI::errstr || 'unknown error');
      return $dbh;
  }
  # sqlquery($dbh, $query, @args)
  #
  # Prepares $query on $dbh and executes it with @args as bind values.
  #
  # Parameters:
  #   $dbh   - database handle
  #   $query - SQL text, with "?" placeholders for the bind values
  #   @args  - values bound to the placeholders, in order
  #
  # Returns: the executed statement handle, ready for fetching.
  # Dies: when prepare or execute fails; the message says which step failed
  # and includes the handle's error text.
  sub sqlquery {
      my ($dbh, $query, @args) = @_;

      my $sth = $dbh->prepare($query)
          or die "Failed to prepare SQL query: "
               . ($dbh->errstr || 'unknown error');
      $sth->execute(@args)
          or die "Failed to execute SQL query: "
               . ($sth->errstr || 'unknown error');

      return $sth;
  }
  114. }