浏览代码

pdf txt extraction

master
root 5 年前
父节点
当前提交
10ff99f344
共有 3 个文件被更改,包括 23 次插入24 次删除
  1. 20
    24
      cgi/autodoc.fcgi
  2. 1
    0
      etc/autodoc.json.default
  3. 2
    0
      src/commands.txt

+ 20
- 24
cgi/autodoc.fcgi 查看文件

@@ -6,6 +6,7 @@ use JSON;
use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use warnings;

$Data::Dumper::Sortkeys = 1;
@@ -61,7 +62,7 @@ while($request->Accept() >= 0) {

my($code, $hdr, $type, $data) = process_query($method, $path, $qs, $post, $user);

print STDERR Dumper("code",$code,"hdr", $hdr, "type", $type, "data", $data);
#print STDERR Dumper("code",$code,"hdr", $hdr, "type", $type, "data", $data);

$hdr = [ ] if !defined $hdr;

@@ -74,6 +75,11 @@ while($request->Accept() >= 0) {
send_response($code, $hdr, $type, $data);
}

# Build a fresh identifier: ask a new Data::UUID generator for the string
# form of a UUID and hand it back lower-cased.
sub gen_uuid {
    return lc(Data::UUID->new->create_str());
}

sub send_response {
my ($code, $hdr, $type, $data) = @_;

@@ -254,7 +260,7 @@ sub get_page_image {
$size,
$dst
);
print STDERR "CMD=$cmd\n";
#print STDERR "CMD=$cmd\n";
system($cmd);
}

@@ -278,16 +284,15 @@ sub get_page_image {
sub api_v1_POST_documents {
my($id, $qs, $post, $user) = @_;

$id = gen_uuid();

my $q = sqlquery($dbh, "
INSERT INTO documents
SET
id = ?,
owner = ?,
status = 'nodata'
", $user);
$q = sqlquery($dbh, "SELECT LAST_INSERT_ID()");
while(my($lastid) = $q->fetchrow_array()) {
$id = $lastid;
}
", $id, $user);

return db_get_document_object($id);
}
@@ -295,24 +300,15 @@ sub api_v1_POST_documents {
sub api_v1_POST_documents_id_data {
my($id, $qs, $post, $user) = @_;

my $pageid;
my $uuid = gen_uuid();
my $ext = $post->{ctype};
$ext =~ s/.*\///;

sqlquery($dbh, "
INSERT INTO
pages
SET
owner = ?,
documentId = ?,
contenttype = ?,
created = NOW(),
status = 'inprogress'",
$user, $id, $post->{ctype});
my $q = sqlquery($dbh, "SELECT LAST_INSERT_ID()");
while(my($last) = $q->fetchrow_array()) {
$pageid = $last;
}
my $file = sprintf("%s/%s/%s_%s.%s",
$conf->{path}{global},
$conf->{path}{queue},
$id, $uuid, $ext);

my $file = $conf->{path}{global} . '/' . $conf->{path}{originals} . '/' . $pageid;
open(F,'>'.$file);
print F $post->{data};
close(F);
@@ -496,7 +492,7 @@ sub sqlquery {
my $query = shift;
my @args = @_;

print STDERR "$query\n";
#print STDERR "$query\n";

my $sth = $dbh->prepare($query) || fatal_api_error(500,"Failed to execute SQL query");
$sth->execute(@args) || fatal_api_error(500,"Failed to execute SQL query");

+ 1
- 0
etc/autodoc.json.default 查看文件

@@ -2,6 +2,7 @@
"path": {
"global": "/opt/autodoc",
"original": "var/original",
"queue": "var/queue",
"images": "var/images",
"cache": "var/cache",
"error_img": "var/error_img.jpeg"

+ 2
- 0
src/commands.txt 查看文件

@@ -1,5 +1,7 @@
apt-get install aspell-fr aspell-it aspell-de aspell-en
apt-get install wfrench wbritish-large witalian wswiss wngerman wamerican-large
apt-get install tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-ita tesseract-ocr-eng
apt-get install poppler-utils

cd /opt/autodoc/www/js/
npm install popper.js --save

正在加载...
取消
保存