Browse Source

finished pdf processing

master
root 5 years ago
parent
commit
f06d7e82c6
5 changed files with 86 additions and 37 deletions
  1. 71
    29
      bin/autodoc_process.pl
  2. 13
    0
      bin/dev_cleanup_db.pl
  3. 2
    2
      cgi/autodoc.fcgi
  4. 0
    3
      var/original/index.html
  5. 0
    3
      var/queue/index.html

+ 71
- 29
bin/autodoc_process.pl View File

use GD::Simple; use GD::Simple;
use Data::Dumper; use Data::Dumper;
use Data::UUID; use Data::UUID;
use File::Temp;
use warnings; use warnings;


$Data::Dumper::Sortkeys = 1; $Data::Dumper::Sortkeys = 1;
my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue}; my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original}; my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};


my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }

opendir(Q,$queuedir); opendir(Q,$queuedir);
foreach my $file ( readdir(Q) ) { foreach my $file ( readdir(Q) ) {
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
# end of PDF # end of PDF
last if $?; last if $?;


my %lang = detect_lang($txt);
my ($lang,$words) = detect_lang($txt);
print "language is $lang\n";
#spell_check($txt,$lang);
my $pageid = get_new_page($docid); my $pageid = get_new_page($docid);
print "new page id $pageid\n"; print "new page id $pageid\n";
update_page_status($pageid, 'inprogress');


system(sprintf("pdftoppm -f %s -l %s -r 600 -jpeg -singlefile %s/%s %s/%s.jpeg", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
#system(sprintf("mv %s/%s.jpeg.jpg %s/%s.jpeg",
print "create original page jpeg $pageid.jpeg"; print "create original page jpeg $pageid.jpeg";
system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
$originaldir, $pageid,
$originaldir, $pageid ));

create_page_words($pageid, $lang, $words);

update_page_status($pageid, 'ok');

} }
} }
else { else {
# ocr / lang detect # ocr / lang detect
# update db # update db


# Store every word found on a page by calling the add_page_word stored
# procedure once per word.
#
# Parameters:
#   $pageid - id of the page the words belong to
#   $lang   - language short code, resolved to an id through the file-level
#             %langid map; may be undef when no language was detected
#   $words  - array reference of words; undef is treated as an empty list
#
# The language id is passed on as undef when $lang is undefined or has no
# entry in %langid.
sub create_page_words {
    my($pageid, $lang, $words) = @_;

    # Resolve the language id once instead of once per word, and skip the
    # hash lookup for an undefined $lang so it does not raise an
    # "uninitialized value" warning.
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{ $words || [] } ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)",
            $pageid, $word, $lang_id);
    }

    return;
}

# Feed $txt to "aspell pipe" and print every line aspell produces to STDOUT.
#
# Parameters:
#   $txt  - text to check; undef is treated as an empty string
#   $lang - language code handed to aspell as --lang=<code>
#
# Returns nothing. Problems (missing language, fork failure, aspell exiting
# with an error) are reported with warn rather than die.
sub spell_check {
    my($txt, $lang) = @_;

    if ( !defined $lang || $lang eq '' ) {
        warn "spell_check: no language given, skipping\n";
        return;
    }
    $txt = '' if !defined $txt;

    my $tmp = File::Temp->new();
    print {$tmp} $txt;
    # The handle is buffered: without a flush the child process may read an
    # empty or truncated file.
    $tmp->flush();

    # Forking open: the child gets the temp file as STDIN and execs aspell
    # with a plain argument list, so no shell ever parses $lang or the path.
    require POSIX;
    my $pid = open(my $aspell, '-|');
    if ( !defined $pid ) {
        warn "spell_check: cannot fork: $!\n";
        return;
    }

    if ( $pid == 0 ) {
        # Child. Leave through _exit on failure so that no END blocks or
        # destructors inherited from the parent run a second time.
        open(STDIN, '<', $tmp->filename) or POSIX::_exit(126);
        exec('aspell', "--lang=$lang", '--ignore-case', 'pipe')
            or POSIX::_exit(127);
    }

    while ( my $line = <$aspell> ) {
        print $line;
    }

    # close() on a piped handle waits for the child and reports its status.
    close($aspell)
        or warn "spell_check: aspell did not finish cleanly (status $?)\n";

    return;
}


sub detect_lang { sub detect_lang {
my($txt) = @_; my($txt) = @_;


my @larr;
my %lcnt; my %lcnt;
my @words;


foreach my $word ( split(/[ '".-]/,$txt) ) {
foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
next if length $word < 3;
$word = lc($word); $word = lc($word);
push @words, $word;

my $lang; my $lang;
print "$word ";
my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word); my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
while(my ($l) = $q->fetchrow_array()) { while(my ($l) = $q->fetchrow_array()) {
print "$l ";
$lang = $l;
$lcnt{$l}++;
} }
print "\n";

push @larr, $lang if defined $lang;
$lcnt{$lang}++ if defined $lang;
} }


print Dumper(\%lcnt); print Dumper(\%lcnt);


my $max = 0;
my $lmax;
foreach my $lang ( keys %lcnt ) {
$lmax = $lang if !defined $lmax;
if ( $lcnt{$lang} > $max ) {
$lmax = $lang;
$max = $lcnt{$lang};
}
}

return ($lmax, \@words);
}

# Set the status of a page by calling the update_page_status stored
# procedure through sqlquery.
#
# Parameters:
#   $pageid - id of the page to update
#   $status - new status value; the visible callers pass 'inprogress' and 'ok'
sub update_page_status {
my($pageid, $status) = @_;

sqlquery($dbh, "CALL update_page_status(?,?)",$pageid, $status);
} }


sub get_new_page { sub get_new_page {
my($docid) = @_; my($docid) = @_;


my $pageid = gen_uuid();

sqlquery($dbh, "
INSERT INTO
pages
SET
id = ?,
created = NOW(),
status = 'inprogress'",
$pageid);

sqlquery($dbh, "
INSERT INTO
documents_pages
SET
documentId = ?,
pageId = ?", $docid, $pageid);
my $pageid;

my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
while(my($id)=$q->fetchrow_array()) {
$pageid = $id;
}


return $pageid; return $pageid;
} }

+ 13
- 0
bin/dev_cleanup_db.pl View File

sqlquery($dbh, "DELETE FROM $_"); sqlquery($dbh, "DELETE FROM $_");
} }


# Report the name and row count of every table returned by
# SHOW TABLE STATUS, one block per table, each introduced by a ruler line.
my $q = sqlquery($dbh, "SHOW TABLE STATUS");
while ( my $row = $q->fetchrow_hashref() ) {
    print( ("=" x 80) . "\n" );

    # Only these two columns are reported, in alphabetical order.
    for my $column ( qw(Name Rows) ) {
        next if !exists $row->{$column};

        my $value = $row->{$column};
        $value = 'NULL' if !defined $value;

        printf("%10s: %s\n", $column, $value);
    }
}


sub load_conf { sub load_conf {
my($file) = @_; my($file) = @_;



+ 2
- 2
cgi/autodoc.fcgi View File

my $q = sqlquery($dbh, "SELECT * FROM documents WHERE id = ?", $id); my $q = sqlquery($dbh, "SELECT * FROM documents WHERE id = ?", $id);
while(my $hash = $q->fetchrow_hashref()) { $document = $hash; } while(my $hash = $q->fetchrow_hashref()) { $document = $hash; }


$q = sqlquery($dbh, "SELECT * FROM pages WHERE documentId = ?", $id);
while(my $hash = $q->fetchrow_hashref()) { push @pages, $hash; push @pageids, $hash->{id}; }
#$q = sqlquery($dbh, "SELECT * FROM pages WHERE documentId = ?", $id);
#while(my $hash = $q->fetchrow_hashref()) { push @pages, $hash; push @pageids, $hash->{id}; }


$q = sqlquery($dbh, " $q = sqlquery($dbh, "
SELECT tags.tag AS tag SELECT tags.tag AS tag

+ 0
- 3
var/original/index.html View File

<html>
<body>Nothing here</body>
</html>

+ 0
- 3
var/queue/index.html View File

<html>
<body>Nothing here</body>
</html>

Loading…
Cancel
Save