/**
 * Extracts the plain text of a PDF document through the global `PDFJS` object.
 *
 * Instances expose:
 *   - `complete`: number of pages processed by the most recent `pdfToText` call;
 *   - `pdfToText(data, callbackPageDone, callbackAllDone)`: see below.
 */
function Pdf2TextClass(){
    const self = this;
    this.complete = 0;

    /**
     * Joins the text blocks of one page into a single string.
     *
     * A separator is only considered when the previous block does not already
     * end with a space: a line break is inserted when the new block starts
     * further left than the previous one (`x` decreased); otherwise a single
     * space is inserted when `y` differs and the previous block does not end
     * in an isolated single letter.
     *
     * @param blocks Array of objects with `str`, `x` and `y` properties.
     * @returns The concatenated page text.
     */
    function joinBlocks(blocks){
        let text = "";
        let last = null;
        for (let k = 0; k < blocks.length; k++){
            const block = blocks[k];
            if (last != null && last.str[last.str.length - 1] !== ' '){
                if (block.x < last.x){
                    text += "\r\n";
                } else if (last.y !== block.y && last.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) === null){
                    text += ' ';
                }
            }
            text += block.str;
            last = block;
        }
        return text;
    }

    /**
     * Loads one page, extracts its text and reports progress.
     *
     * @returns Promise resolving to the page text followed by a blank line,
     * or to an empty string when the page exposes no `bidiTexts`.
     */
    function extractPage(pdf, pageIndex, total, callbackPageDone){
        // Promise.resolve() adopts whatever thenable the library hands back,
        // so the chain below always runs on native promises.
        return Promise.resolve( pdf.getPage(pageIndex) ).then( function(page){
            return Promise.resolve( page.getTextContent() ).then( function(textContent){
                let pageText = "";
                if (textContent.bidiTexts != null){
                    pageText = joinBlocks( textContent.bidiTexts ) + "\n\n";
                    console.log("page " + page.pageNumber + " finished.");
                }
                ++self.complete;
                callbackPageDone( self.complete, total );
                return pageText;
            });
        });
    }

    /**
     *
     * @param data ArrayBuffer of the pdf file content, or a string accepted
     * by `PDFJS.getDocument`.
     * @param callbackPageDone To inform the progress each time
     * when a page is finished. The callback function's input parameters are:
     * 1) number of pages done;
     * 2) total number of pages in file.
     * It is also called once with (0, total) before any page is processed.
     * @param callbackAllDone The input parameter of callback function is
     * the result of extracted text from pdf file, pages in document order.
     * It is called as soon as the last page has been processed.
     * @returns Promise resolving to the same full text; it rejects when
     * loading the document, loading a page or one of the callbacks fails.
     *
     */
    this.pdfToText = function(data, callbackPageDone, callbackAllDone){
        console.assert( data instanceof ArrayBuffer || typeof data === 'string' );
        // Restart the progress counter so the same instance can be reused.
        self.complete = 0;
        return Promise.resolve( PDFJS.getDocument( data ) ).then( function(pdf){
            const total = pdf.numPages;
            callbackPageDone( 0, total );
            const pending = [];
            for (let i = 1; i <= total; i++){
                pending.push( extractPage( pdf, i, total, callbackPageDone ) );
            }
            // Promise.all keeps request order, so pages that finish out of
            // order (or carry no text) can neither be shuffled nor dropped.
            return Promise.all( pending );
        }).then( function(pageTexts){
            const fullText = pageTexts.join("");
            callbackAllDone( fullText );
            return fullText;
        });
    }; // end of pdfToText()
} // end of class
// Page-side glue between three elements looked up by id:
//   "input"     - element whose `src` attribute holds the URL of the PDF;
//   "processor" - element exposing `contentWindow` (presumably an iframe
//                 running the extraction code - confirm against the page);
//   "output"    - element that receives the extracted text.
var input = document.getElementById("input");
var processor = document.getElementById("processor");
var output = document.getElementById("output");

// listen for messages from the processor
window.addEventListener("message", function(event){
    // Ignore messages that do not come from the processor window.
    if (event.source !== processor.contentWindow) return;

    // Downloads the PDF as an ArrayBuffer and hands it to the processor.
    function sendPdfToProcessor(){
        var xhr = new XMLHttpRequest();
        xhr.open('GET', input.getAttribute("src"), true);
        xhr.responseType = "arraybuffer";
        xhr.onload = function(){
            // Status 0 is reported for non-HTTP schemes such as file://.
            var succeeded = xhr.status === 0 || (xhr.status >= 200 && xhr.status < 300);
            if (!succeeded){
                // Do not forward an HTTP error body as if it were a PDF.
                console.error("Fetching the PDF failed with HTTP status " + xhr.status);
                return;
            }
            // NOTE(review): "*" lets a document of any origin loaded in the
            // processor window receive the PDF bytes; pass the processor's
            // concrete origin here if it is known.
            processor.contentWindow.postMessage(xhr.response, "*");
        };
        xhr.onerror = function(){
            console.error("Network error while fetching the PDF");
        };
        xhr.send();
    }

    // "ready" = the processor is ready, so fetch the PDF file
    if (event.data === "ready"){
        sendPdfToProcessor();
        return;
    }

    // anything else = the processor has returned the text of the PDF
    if (typeof event.data !== "string"){
        // Calling .replace on a non-string payload would throw.
        console.error("Ignoring non-text message from the processor");
        return;
    }
    output.textContent = event.data.replace(/\s+/g, " ");
}, true);
// This file is called myPDFfileToText.js and is in the root folder
// Usage example: extract the text of one PDF and print it to the console.
let PDFJS = require('pdfjs-dist');
let pathToPDF = 'path/to/myPDFfileToText.pdf';
// Pdf2TextObj assigns to `this`, so it must be invoked with `new`; a plain
// call would write `complete` and `pdfToText` onto the global object in
// sloppy mode and throw a TypeError in strict mode.
let toText = new Pdf2TextObj();
let onPageDone = function() {}; // don't want to do anything between pages
let onFinish = function(fullText) { console.log(fullText); };
toText.pdfToText(pathToPDF, onPageDone, onFinish);
/**
 * Creates a PDF text extractor backed by the `PDFJS` binding in scope.
 *
 * Works both as `Pdf2TextObj()` and as `new Pdf2TextObj()`. A plain call
 * builds a fresh object instead of writing onto whatever `this` happens to
 * be (the global object in sloppy mode, `undefined` in strict mode).
 *
 * The returned object exposes:
 *   - `complete`: number of pages processed by the most recent `pdfToText` call;
 *   - `pdfToText(path, callbackPageDone, callbackAllDone)`: see below.
 */
function Pdf2TextObj() {
  const self = this instanceof Pdf2TextObj ? this : {};
  self.complete = 0;

  /**
   * Joins the text items of one page into a single string.
   *
   * A separator is only considered when the previous item does not already
   * end with a space: a line break is inserted when `transform[5]` decreased;
   * otherwise a single space is inserted when `transform[4]` differs and the
   * previous item does not end in an isolated single letter.
   *
   * NOTE(review): the original code named `transform[5]` "X" and
   * `transform[4]` "Y". If `transform` is a PDF matrix [a, b, c, d, e, f],
   * index 4 would be the horizontal and index 5 the vertical offset, i.e. the
   * names look swapped. The indices are kept as they were; confirm against
   * the pdf.js TextItem documentation before changing them.
   */
  function joinItems(items) {
    let text = "";
    let last = null;
    for (const item of items) {
      if (last != null && last.str[last.str.length - 1] !== ' ') {
        if (item.transform[5] < last.transform[5]) {
          text += "\r\n";
        } else if (
          item.transform[4] !== last.transform[4] &&
          last.str.match(/^(\s?[a-zA-Z])$|^(.+\s[a-zA-Z])$/) === null
        ) {
          text += ' ';
        }
      }
      text += item.str;
      last = item;
    }
    return text;
  }

  /**
   * Loads one page, extracts its text and reports progress.
   *
   * @returns Promise resolving to the page text followed by a blank line,
   * or to an empty string when the page exposes no `items`.
   */
  function extractPage(pdf, pageIndex, total, callbackPageDone) {
    return pdf.getPage(pageIndex).then(function(page) {
      return page.getTextContent().then(function(textContent) {
        let pageText = "";
        if (textContent.items != null) {
          pageText = joinItems(textContent.items) + "\n\n";
          console.log("page " + page.pageNumber + " finished.");
        }
        ++self.complete;
        callbackPageDone(self.complete, total, page);
        return pageText;
      });
    });
  }

  /**
   *
   * @param path Path to the pdf file.
   * @param callbackPageDone To inform the progress each time
   * when a page is finished. The callback function's input parameters are:
   * 1) number of pages done.
   * 2) total number of pages in file.
   * 3) the `page` object itself or null.
   * It is also called once with (0, total, null) before any page is processed.
   * @param callbackAllDone Called after all text has been collected. Input parameters:
   * 1) full text of parsed pdf, pages in document order.
   * @returns Promise resolving to the same full text; it rejects when
   * loading the document, loading a page or one of the callbacks fails.
   *
   */
  self.pdfToText = function(path, callbackPageDone, callbackAllDone) {
    // Restart the progress counter so the same object can be reused.
    self.complete = 0;
    return PDFJS.getDocument(path).promise.then(function(pdf) {
      const total = pdf.numPages;
      callbackPageDone(0, total, null);
      const pending = [];
      for (let pagei = 1; pagei <= total; pagei++) {
        pending.push(extractPage(pdf, pagei, total, callbackPageDone));
      }
      // Pages do not necessarily finish in consecutive order. Promise.all
      // returns results in request order, so no timer-based wait is needed
      // and pages without text can no longer shift or truncate the output.
      return Promise.all(pending);
    }).then(function(pageTexts) {
      const fullText = pageTexts.join("");
      callbackAllDone(fullText);
      return fullText;
    });
  }; // Ends pdfToText()

  return self;
} // Ends object factory
/**
 * Created by velten on 25.04.16.
 */
"use strict";
// Requests `pdfUrl` with the `request` package, pipes the response into the
// `pdf2json` export and logs how many entries the parsed pages hold in their
// `Texts` arrays.
let pdfUrl = "http://example.com/example.pdf";
let request = require('request');
// NOTE(review): the module export is piped into directly, without
// instantiating anything - confirm that this matches the installed
// pdf2json version.
var pdfParser = require('pdf2json');

// Logs parser failures.
const reportParseError = function (err) {
    console.error(err);
};

// Sums the number of text entries over all pages, prints the total and
// then releases the parser.
const reportTextCount = function (pdf) {
    //optionally:
    //let pdf = pdfParser.getMergedTextBlocksIfNeeded();
    const count1 = [...pdf.formImage.Pages].reduce(
        (runningTotal, page) => runningTotal + page.Texts.length,
        0
    );
    console.log(count1);
    pdfParser.destroy();
};

let pdfPipe = request({url: pdfUrl, encoding:null}).pipe(pdfParser);
pdfPipe.on("pdfParser_dataError", reportParseError);
pdfPipe.on("pdfParser_dataReady", reportTextCount);
8条答案
按热度按时间wooyq4lh1#
因为 pdf.js 这些年一直在发展,所以我想给出一个新的答案:这件事可以完全在本地完成,不需要任何服务器或外部服务。新版 pdf.js 提供了 `getTextContent()` 函数,可以用它获取文本内容。我已经用下面的代码成功做到了这一点。
1. 每一步得到的都是一个 Promise,需要用 `.then( function(){...})` 的形式继续下一步:
   - `PDFJS.getDocument( data ).then( function(pdf) {`
   - `pdf.getPage(i).then( function(page){`
   - `page.getTextContent().then( function(textContent){`
2. 最终得到的是一个字符串数组 `textContent.bidiTexts[]`。将它们连接起来即可得到一页的文本。文本块的坐标用于判断是否需要插入换行符或空格。(这可能不够健壮,但从我的测试来看似乎可行。)
3. 输入参数 `data` 需要是 URL 或 ArrayBuffer 类型的数据。我使用了 FileReader API 中的 `readAsArrayBuffer(file)` 函数来获取数据。

**注意:**根据其他用户的反馈,该库更新后这段代码已无法运行。根据下面 async5 的评论,您需要将 `textContent.bidiTexts` 替换为 `textContent.items`。
2j4z5cfb2#
我无法让 gm2008 的示例运行(pdf.js 的内部数据结构显然已经发生了变化),所以我编写了自己的、完全基于 Promise 的解决方案:不使用任何 DOM 元素、querySelector 或 canvas,并使用 Mozilla 示例中更新后的 pdf.js。
由于我是在 node-webkit 中使用它,它接收的输入是待上传文件的路径。你需要确保已经下载 cmaps 并把路径指向它所在的位置,另外还需要 pdf.js 和 pdf.worker.js 才能让它工作。
用法:
6vl6ewon3#
下面是一些JavaScript代码,使用http://hublog.hubmed.org/archives/001948.html中的Pdf.js可以完成您想要的任务:
......这里有一个例子:
http://git.macropus.org/2011/11/pdftotext/example/
ukdjmx9f4#
只留下一个完整的样本
bfnvny8b5#
注意:这段代码假设你使用的是nodejs,这意味着你解析的是本地文件而不是网页文件,因为最初的问题并没有明确询问解析网页上的pdf文件。
@gm2008 的答案是一个很好的起点(请阅读该答案及其评论以了解更多信息),但它需要一些更新(08/19),并且包含一些未使用的代码。我也更喜欢完整一些的示例。还可以做更多的重构和调整(例如改用 `await`),但目前它尽可能贴近原来的答案。和以前一样,这里使用 Mozilla 的 PDFjs 库,npmjs 包位于 https://www.npmjs.com/package/pdfjs-dist。
根据我的经验,这段代码在判断何处放置空格方面做得不好,但那是另一个问题。
[编辑:我相信改用 `.transform` 的更新已经把空白的处理恢复为原来的行为。]
在终端中运行:
node myPDFfileToText.js
u59ebvdq6#
2021年2月更新
b0zn9rqh7#
对于所有确实希望在 Node 服务器上使用它的人:
wljmcqd88#
有可能,但:
所以如果你有一些空闲时间你可以学习pdf格式并自己写这样一个库,或者你当然可以只使用服务器端库。