是否有目前仍然可用的方法,可以通过 JavaScript 将 PDF/DocX 转换为文本?

Is there an up-to-date way to convert PDF/DocX files into text with JavaScript?

/**
 * Once the page has loaded, wires the #myResume file input so that picking a
 * file tries to turn it into text:
 *   - text/*            -> read as text and shown in #displayResume
 *   - image/*           -> passed to OCRAD, recognized text shown via alert()
 *   - application/pdf   -> handed to Tesseract (result only logged to console)
 *   - application/msword-> only loaded into an Image (no text is produced)
 *   - anything else     -> an error message is shown in #displayResume
 */
window.onload = function() {
    var myResume = document.getElementById('myResume');
    var displayResume = document.getElementById('displayResume');
    var textType = /text.*/;
    var imageType = /image.*/;

    myResume.addEventListener('change', function() {
        var resume = myResume.files[0];

        // The change event can fire with an empty FileList (for example when
        // a previous selection is cleared); there is nothing to read then.
        if (!resume) {
            return;
        }

        var reader = new FileReader();

        if (resume.type.match(textType)) {
            reader.onload = function() {
                displayResume.innerText = reader.result;
            };
            reader.readAsText(resume);
        }
        else if (resume.type.match(imageType)) {
            reader.onload = function() {
                displayResume.innerHTML = "";
                var img = new Image();
                // Image decoding is asynchronous: run OCR only after the load
                // event, and register the handler before assigning src.
                img.onload = function() {
                    var string = OCRAD(img);
                    alert(string);
                };
                img.src = reader.result;
            };
            reader.readAsDataURL(resume);
        }
        else if (resume.type === 'application/pdf') {
            reader.onload = function() {
                displayResume.innerHTML = "";
                // NOTE(review): an Image element cannot decode a PDF data URL,
                // so this OCR call is not expected to yield text. The pages
                // would have to be rendered to an image/canvas first.
                var img = new Image();
                img.src = reader.result;
                Tesseract.recognize(img)
                    .progress(function  (p) { console.log('progress', p)    })
                    .then(function (result) { console.log('result', result) })
            };
            reader.readAsDataURL(resume);
        }
        else if (resume.type === 'application/msword') {
            reader.onload = function() {
                displayResume.innerHTML = "";
                // NOTE(review): the image is created but never used, so no
                // text is extracted from Word documents here.
                var img = new Image();
                img.src = reader.result;
            };
            reader.readAsDataURL(resume);
        }
        else
            displayResume.innerText = "Media type couldn't recognized.";
    });
}

// When #submitBTN is clicked, warn the user if neither the #myResume nor the
// #myCover file input has a file selected.
$("#submitBTN").click(function() {
    // Look the inputs up explicitly: the original relied on the browser
    // exposing elements with an id as implicit global variables.
    var resumeInput = document.getElementById('myResume');
    var coverInput = document.getElementById('myCover');

    if (resumeInput.files.length === 0 && coverInput.files.length === 0) {
        alert ("No files uploaded.");
    }
});
/* Auto left/right margins plus 250px of top padding. Used together with
   .container in the page markup. NOTE(review): the large top padding
   presumably clears the fixed-top navbar - confirm. */
.centralize {
    margin-left: auto;
    margin-right: auto;
    padding-top: 250px;
}
/* Output area the script writes the file's text into: full width, spaced 2em
   from the content above, scrolling horizontally when content overflows. */
#displayResume {
  margin-top: 2em;
  width: 100%;
  overflow-x: auto;
}
/* Full-width, centered, white title text. float and color are forced with
   !important. NOTE(review): presumably to override the framework's
   .navbar-brand rules - confirm. */
.header-title {
    float: none !important;
    color: #FFF !important;
    text-align: center;
    width: 100%;
}
/* Full-width navbar header with centered text and 25px of padding above and
   below. */
.navbar-header {
    width: 100%;
    text-align: center;
    padding-top: 25px;
    padding-bottom: 25px;
}
<!doctype html>
<html ng-app>
<head>
    <meta charset="UTF-8">
    <!-- X-UA-Compatible expects a document-mode value; "text/html" is not one. -->
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0">
    <meta name="robots" content="all,follow">
    <title>Arete Human Resources</title>

    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
    <link href="css/ionic.min.css" rel="stylesheet"/>
    <link href="css/style.css" rel="stylesheet">
    <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.2.23/angular.min.js"></script>
</head>
<body>
    <nav class="navbar navbar-inverse navbar-fixed-top">
      <div class="container">
        <div class="navbar-header">
          <a class="navbar-brand header-title">Arete Human Resources</a>
        </div>
      </div>
    </nav>

    <div class="container centralize">
        <!-- Row 1: resume upload, plus a name field echoed live through ng-model. -->
        <div class="row">
            <div class="col-xs-5 col-sm-5 col-md-5 col-lg-5">
                <input type="file" id="myResume" accept=".txt,.doc,.docx,.pdf,.jpg">
            </div>
            <div class="col-xs-4 col-sm-4 col-md-4 col-lg-4">
                <input type="text" ng-model="fileName" placeholder="Your name here">
            </div>
            <div class="col-xs-3 col-sm-3 col-md-3 col-lg-3">
                <h4>{{fileName}}</h4>
            </div>
        </div>
        <!-- Row 2: second upload (#myCover), plus a description field echoed live. -->
        <div class="row">
            <div class="col-xs-5 col-sm-5 col-md-5 col-lg-5">
                <input type="file" id="myCover" accept=".txt,.doc,.docx,.pdf,.jpg">
            </div>
            <div class="col-xs-4 col-sm-4 col-md-4 col-lg-4">
                <input type="text" ng-model="fileDesc" placeholder="Explanations">
            </div>
            <div class="col-xs-3 col-sm-3 col-md-3 col-lg-3">
                <h5>{{fileDesc}}</h5>
            </div>
        </div>
        <hr>
        <!-- The click handler is bound from script via $("#submitBTN"), so no inline onclick is needed. -->
        <button type="submit" class="button button-outline button-positive" id="submitBTN" style="float:right">Submit</button>
        <!-- Output area the script writes the selected file's text into. -->
        <div id="displayResume"></div>
    </div>

    <!-- Load jQuery over HTTPS from a pinned release: the "jquery-latest" alias is
         frozen at 1.11.1, and a plain http:// script is blocked as mixed content
         when the page itself is served over HTTPS. -->
    <script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
    <script src="js/ionic.min.js" type="text/javascript"></script>
    <script src="js/ionic.bundle.min.js" type="text/javascript"></script>
    <script src="js/ionic-angular.min.js" type="text/javascript"></script>
    <script src="js/ocrad.js" type="text/javascript"></script>
<!--<script src="js/require.js" type="text/javascript"></script>-->
    <script src='https://cdn.rawgit.com/naptha/tesseract.js/1.0.10/dist/tesseract.js'></script>
    <script src="js/main.js" type="text/javascript"></script>
</body>
</html>

我正在尝试获取用户上传的文件,将其转换为文本,以便稍后发送用户所提交的任何内容。我预计主要的输入类型是文本、PDF、DocX,甚至图像。

我发现 OCRAD 可用于图像识别,而且效果很好;但对于 PDF/DocX,尽管我尝试了 pdfreader、pdf-to-text、composer、pdf.js、docxtemplater、study.js 以及许多其他库,却始终没有成功。我像往常一样在终端中通过 Node 安装这些库,甚至还进行了调试,但仍然找不到可行的方法。这里是 CodePen,下面是代码片段。如果有人能推荐一个目前确实可用的现有库,我将不胜感激。

抱歉,文档中可能没有解释清楚:pdfreader 是在 Node.js 上运行的,而不是在网页浏览器中运行的,因此才会出现 require 相关的问题。

以下是在 Node.js 中将您的 PDF 简历转换为文本的方法:

var pdfreader = require('pdfreader');

// Text fragments of the current page, grouped under their y-position key.
let rows = {};

/**
 * Logs every accumulated row, ordered by ascending numeric y-position.
 * The fragments of a row are concatenated without a separator.
 */
function printRows() {
  const yPositions = Object.keys(rows);
  // Keys are strings, so compare them as floats rather than lexicographically.
  yPositions.sort((a, b) => parseFloat(a) - parseFloat(b));
  for (const y of yPositions) {
    const fragments = rows[y] || [];
    console.log(fragments.join(''));
  }
}

// Stream the PDF item by item, printing the collected rows each time a page
// marker arrives and once more at the end of the file.
new pdfreader.PdfReader().parseFileItems('CV_ErhanYasar.pdf', function(err, item){
  if (err) {
    // Report the failure instead of treating it as end of file.
    console.error('Failed to parse PDF:', err);
    return;
  }
  if (!item) {
    // end of file: flush the rows of the last page.
    // There is no item here, so item.page must not be read.
    printRows();
    rows = {};
    return;
  }
  if (item.page) {
    // page marker: print what was collected so far, then start a new page
    printRows();
    console.log('PAGE:', item.page);
    rows = {}; // clear rows for next page
  }
  else if (item.text) {
    // accumulate text items into rows object, per line
    (rows[item.y] = rows[item.y] || []).push(item.text);
  }
});

=> 那么,这就是您将获得的输出: