An unfinished system to manage all your paper documentation in an easy way.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/perl
  2. use strict;
  3. use JSON;
  4. use DBI;
  5. use GD::Simple;
  6. use Data::Dumper;
  7. use Data::UUID;
  8. use warnings;
  # Sort hash keys in Data::Dumper output so debug dumps are stable.
  $Data::Dumper::Sortkeys = 1;

  # Configuration and the database handle shared by the subs below.
  my $conf = load_conf("../etc/autodoc.json");
  my $dbh = sqlconnect($conf->{sql});

  # Both working directories live under the configured global path.
  my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
  my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

  # Walk the queue directory. Without this check a missing directory would
  # silently look like an empty queue.
  opendir(my $queue_dh, $queuedir)
      or die "Failed to open queue directory $queuedir: $!";
  foreach my $file ( readdir($queue_dh) ) {
      # Queue entries are named <uuid>_<uuid>.<ext>; the first UUID is used
      # as the document id, the second one is not used here.
      next unless $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/;
      my ($docid, $ext) = ($1, $3);
      print "Found document id $docid of type $ext\n";

      # Only PDF files are processed so far; other extensions are skipped.
      next unless $ext eq 'pdf';

      # $page is 0-based; the command line tools are given $page + 1.
      for (my $page = 0; ; $page++) {
          my $pageno = $page + 1;
          my $txt = '';
          print "texting page $page\n";

          # Extract the text of this single page. The command still goes
          # through the shell because stderr is redirected.
          my $cmd = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
              $pageno, $pageno, $queuedir, $file);
          open(my $txt_fh, '-|', $cmd) or last;
          while (my $line = <$txt_fh>) {
              chomp $line;
              $txt .= ' ' . $line;
          }
          close($txt_fh);

          # end of PDF: a non-zero exit status from pdftotext stops the loop
          last if $?;

          my %lang = detect_lang($txt);
          my $pageid = get_new_page($docid);
          print "new page id $pageid\n";

          # Render the page as a 600 dpi JPEG. The list form of system()
          # bypasses the shell, so paths containing spaces survive intact.
          # NOTE(review): pdftoppm appears to append its own extension to the
          # output root, so the file on disk may be "$pageid.jpeg.jpg" (the
          # original carried an unfinished, commented-out "mv" for this) -
          # confirm and rename if required.
          my $rc = system('pdftoppm',
              '-f', $pageno, '-l', $pageno,
              '-r', 600, '-jpeg', '-singlefile',
              "$queuedir/$file", "$originaldir/$pageid.jpeg");
          warn "pdftoppm failed for page $pageid (status $?)\n" if $rc != 0;

          print "create original page jpeg $pageid.jpeg\n";
      }
  }
  closedir($queue_dh);
  46. # open queue
  47. # decompose PDF
  48. # normalise all files as jpegs
  49. # generate page
  50. # ocr / lang detect
  51. # update db
  # Tally, per language, how many words of a text are found in the dict table.
  #
  #   $txt - text to analyse; it is split on space, single quote, double
  #          quote, dot and dash, and every word is lower-cased before lookup.
  #
  # Returns a hash (language => number of matching words). Uses the
  # file-level $dbh. Each word, the languages found for it, and a final dump
  # of the tally are printed as progress output.
  sub detect_lang {
      my ($txt) = @_;
      my %lcnt;
      foreach my $word ( split(/[ '".-]/, $txt) ) {
          # Adjacent separators produce empty tokens; they are not words, so
          # do not spend a database query on them.
          next if $word eq '';
          $word = lc($word);
          my $lang;
          print "$word ";
          my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
          # When several rows match, the last row fetched wins.
          while (my ($row_lang) = $q->fetchrow_array()) {
              print "$row_lang ";
              $lang = $row_lang;
          }
          print "\n";
          $lcnt{$lang}++ if defined $lang;
      }
      print Dumper(\%lcnt);
      # Hand the tally back; the caller assigns the result to a hash.
      return %lcnt;
  }
  # Register a new page for a document.
  #
  #   $docid - id of the document the page belongs to
  #
  # Inserts a row into pages (status 'inprogress') and a row into
  # documents_pages linking the document to the page, using the file-level
  # $dbh. Returns the id generated for the new page.
  sub get_new_page {
      my ($docid) = @_;
      my $new_id = gen_uuid();

      my $insert_page = "
  INSERT INTO
  pages
  SET
  id = ?,
  created = NOW(),
  status = 'inprogress'";
      my $insert_link = "
  INSERT INTO
  documents_pages
  SET
  documentId = ?,
  pageId = ?";

      # The page row is written first, then the document/page link.
      sqlquery($dbh, $insert_page, $new_id);
      sqlquery($dbh, $insert_link, $docid, $new_id);

      return $new_id;
  }
  # Produce a fresh UUID string, lower-cased, using Data::UUID.
  sub gen_uuid {
      my $uuid = Data::UUID->new->create_str();
      return lc $uuid;
  }
  # Read a JSON configuration file and decode it.
  #
  #   $file - path of the configuration file
  #
  # Returns whatever from_json() yields for the file content.
  # Dies, naming the file and the system error, when it cannot be opened.
  sub load_conf {
      my ($file) = @_;
      # Three-argument open with a lexical handle: the path can never be
      # interpreted as an open mode or a pipe.
      open(my $fh, '<', $file)
          or die "Failed to load configuration file $file: $!";
      # Slurp the whole file in one read.
      my $json = do { local $/; <$fh> };
      close($fh);
      $json = '' unless defined $json;
      return from_json($json);
  }
  # Open a MySQL connection through DBI.
  #
  #   $sql - hash ref with the keys base, host, user and pass
  #
  # Returns the database handle. Dies with the DBI error text when
  # DBI->connect returns a false value.
  sub sqlconnect {
      my ($sql) = @_;
      my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
      my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass});
      if (!$dbh) {
          my $reason = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
          die "Failed to connect to database: $reason";
      }
      return $dbh;
  }
  # Prepare and run a parameterised SQL statement.
  #
  #   $dbh   - database handle providing prepare()
  #   $query - SQL text, with ? placeholders
  #   @args  - bind values passed to execute()
  #
  # Returns the executed statement handle. Dies when prepare() or execute()
  # returns a false value.
  sub sqlquery {
      my ($dbh, $query, @args) = @_;

      my $sth = $dbh->prepare($query);
      die "Failed to execute SQL query" unless $sth;
      die "Failed to execute SQL query" unless $sth->execute(@args);

      return $sth;
  }
  117. }