Ver código fonte

finished pdf processing

master
root 5 anos atrás
pai
commit
f06d7e82c6
5 arquivos alterados com 86 adições e 37 exclusões
  1. 71
    29
      bin/autodoc_process.pl
  2. 13
    0
      bin/dev_cleanup_db.pl
  3. 2
    2
      cgi/autodoc.fcgi
  4. 0
    3
      var/original/index.html
  5. 0
    3
      var/queue/index.html

+ 71
- 29
bin/autodoc_process.pl Ver arquivo

@@ -6,6 +6,7 @@ use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use File::Temp;
use warnings;

$Data::Dumper::Sortkeys = 1;
@@ -16,6 +17,10 @@ my $dbh = sqlconnect($conf->{sql});
my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while(my($id,$short)=$q->fetchrow_array()) { $langid{$short}=$id; }

opendir(Q,$queuedir);
foreach my $file ( readdir(Q) ) {
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) {
@@ -39,13 +44,23 @@ foreach my $file ( readdir(Q) ) {
# end of PDF
last if $?;

my %lang = detect_lang($txt);
my ($lang,$words) = detect_lang($txt);
print "language is $lang\n";
#spell_check($txt,$lang);
my $pageid = get_new_page($docid);
print "new page id $pageid\n";
update_page_status($pageid, 'inprogress');

system(sprintf("pdftoppm -f %s -l %s -r 600 -jpeg -singlefile %s/%s %s/%s.jpeg", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
#system(sprintf("mv %s/%s.jpeg.jpg %s/%s.jpeg",
print "create original page jpeg $pageid.jpeg";
system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid));
system(sprintf("mv %s/%s.jpg %s/%s.jpeg",
$originaldir, $pageid,
$originaldir, $pageid ));

create_page_words($pageid, $lang, $words);

update_page_status($pageid, 'ok');

}
}
else {
@@ -61,51 +76,78 @@ closedir(Q);
# ocr / lang detect
# update db

# Store every word found on a page through the add_page_word procedure.
#
# Arguments:
#   $pageid - id of the page the words belong to
#   $lang   - short language code as returned by detect_lang(); may be
#             undef when no word of the page matched the dictionary
#   $words  - array reference with the words of the page
#
# The short language code is translated to its id via the file-level
# %langid map before it is handed to the stored procedure.
sub create_page_words {
    my($pageid, $lang, $words) = @_;

    # The language id is identical for every word, so resolve it once.
    # An undefined $lang yields the same undef the lookup would have
    # produced, without the "uninitialized value" warning.
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{ $words || [] } ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)",
            $pageid, $word, $lang_id);
    }

    return;
}

# Run a text through "aspell pipe" and print aspell's output.
#
# Arguments:
#   $txt  - text to check
#   $lang - language code passed to aspell as --lang
#
# The text is written to a temporary file that becomes aspell's STDIN.
# Failures to start aspell are reported with warn(); nothing is returned.
sub spell_check {
    my($txt, $lang) = @_;

    my $tmp = File::Temp->new();
    print $tmp $txt;
    # Without a flush the text may still sit in Perl's output buffer when
    # the child process opens the file, so aspell would read nothing.
    $tmp->flush();

    # Fork with the child's STDOUT connected to $aspell, then exec aspell
    # with a list of arguments so that $lang never passes through a shell.
    my $pid = open(my $aspell, '-|');
    if ( !defined $pid ) {
        warn "spell_check: cannot fork: $!\n";
        return;
    }

    if ( $pid == 0 ) {
        # Child: read the temporary file on STDIN and become aspell.
        if ( !open(STDIN, '<', $tmp->filename) ) {
            warn "spell_check: cannot read " . $tmp->filename . ": $!\n";
            exit 1;
        }
        exec('aspell', "--lang=$lang", '--ignore-case', 'pipe')
            or die "spell_check: cannot exec aspell: $!\n";
    }

    while ( my $line = <$aspell> ) {
        print $line;
    }

    close($aspell)
        or warn "spell_check: aspell exited abnormally (status $?)\n";

    return;
}


# Guess the language of a text by looking its words up in the dict table.
#
# Arguments:
#   $txt - text to analyse
#
# Returns a two element list:
#   - the language with the most dictionary hits, or undef when no word
#     matched
#   - an array reference with every word of three or more characters,
#     lower-cased
#
# Prints each word with its matching languages, and a dump of the final
# counts, as debug output.
sub detect_lang {
    my($txt) = @_;

    my %lcnt;
    my @words;

    # NOTE(review): \x{c380}-\x{c3bf} is a range of code points; it was
    # presumably meant to keep accented Latin letters (UTF-8 bytes
    # C3 80 .. C3 BF). Confirm how $txt is encoded before changing it.
    foreach my $word ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/,$txt) ) {
        next if length $word < 3;
        $word = lc($word);
        push @words, $word;

        print "$word ";
        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
        while(my ($l) = $q->fetchrow_array()) {
            print "$l ";
            # Count every matching language exactly once per dictionary row.
            $lcnt{$l}++;
        }
        print "\n";
    }

    print Dumper(\%lcnt);

    # Pick the language with the highest count. Keys are visited in sorted
    # order so that a tie always resolves to the same language.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max = $lcnt{$lang};
        }
    }

    return ($lmax, \@words);
}

# Set the status of a page through the update_page_status procedure.
#
# Arguments:
#   $pageid - id of the page to update
#   $status - new status value
#
# Returns whatever sqlquery() returns for the call.
sub update_page_status {
    my $pageid = shift;
    my $status = shift;

    my @bind = ($pageid, $status);
    return sqlquery($dbh, "CALL update_page_status(?,?)", @bind);
}

# Create a new page for a document and return its id.
#
# Arguments:
#   $docid - id of the document the page belongs to
#
# Returns the id handed back by the create_page stored procedure, or
# undef (with a warning) when the procedure returned no row.
#
# The page is created only through create_page; running the former
# direct INSERT statements as well would create a second page per call.
sub get_new_page {
    my($docid) = @_;

    my $pageid;

    my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
    # Keep the id from the last row the procedure returns.
    while(my($id)=$q->fetchrow_array()) {
        $pageid = $id;
    }

    warn "get_new_page: create_page returned no id for document $docid\n"
        if !defined $pageid;

    return $pageid;
}

+ 13
- 0
bin/dev_cleanup_db.pl Ver arquivo

@@ -20,6 +20,19 @@ foreach ( @del ) {
sqlquery($dbh, "DELETE FROM $_");
}

# Print the Name and Rows columns of SHOW TABLE STATUS for every table,
# each table preceded by a separator line of 80 "=" characters.
my $q = sqlquery($dbh, "SHOW TABLE STATUS");
while ( my $row = $q->fetchrow_hashref() ) {
    print "=" x 80 . "\n";

    # Only the table name and its row count are of interest here.
    my @wanted = grep { /^(Name|Rows)$/ } sort keys %{$row};

    for my $column ( @wanted ) {
        my $value = $row->{$column};
        $value = 'NULL' if !defined $value;
        printf("%10s: %s\n", $column, $value);
    }
}


sub load_conf {
my($file) = @_;


+ 2
- 2
cgi/autodoc.fcgi Ver arquivo

@@ -141,8 +141,8 @@ sub db_get_document_object {
my $q = sqlquery($dbh, "SELECT * FROM documents WHERE id = ?", $id);
while(my $hash = $q->fetchrow_hashref()) { $document = $hash; }

$q = sqlquery($dbh, "SELECT * FROM pages WHERE documentId = ?", $id);
while(my $hash = $q->fetchrow_hashref()) { push @pages, $hash; push @pageids, $hash->{id}; }
#$q = sqlquery($dbh, "SELECT * FROM pages WHERE documentId = ?", $id);
#while(my $hash = $q->fetchrow_hashref()) { push @pages, $hash; push @pageids, $hash->{id}; }

$q = sqlquery($dbh, "
SELECT tags.tag AS tag

+ 0
- 3
var/original/index.html Ver arquivo

@@ -1,3 +0,0 @@
<html>
<body>Nothing here</body>
</html>

+ 0
- 3
var/queue/index.html Ver arquivo

@@ -1,3 +0,0 @@
<html>
<body>Nothing here</body>
</html>

Carregando…
Cancelar
Salvar