Après avoir passé en revue les API d'OCR, nous en avons trouvé de très performantes, mais toutes sont payantes et assez onéreuses, d'autant plus que nous sommes en phase de test. J'ai donc réalisé un bout de code Java qui lit un fichier FAX ou TIFF pour le transformer en PDF via l'excellente API iText.
Une fois ce PDF produit, je l'envoie vers le nuage grâce à la Google Documents List Data API v3.0 pour exploiter la conversion par OCR.
Enfin, je télécharge le document converti sur mon disque local.
En espérant que cela puisse aider !
public class FaxToPdfToGoogle { // Fax To PDF private final String fileToRead = "Document.fax"; // File to read private final String fileToExport = "DocumentToUpload.pdf"; // File produced to Google OCR private final Rectangle pageSize = PageSize.A4; // PDF To Upload resolution, a high resolution result in com.google.gdata.util.InvalidEntryException: Could not convert document. // PDF To Google private final String googleUsername = ""; // Your email of google account private final String googlePassword = ""; // your password of google account private final String fileNameInGoogle = "DocumentTest-FaxToPdfOcr.pdf"; // File produced to Google OCR private final DocsService client = new DocsService("enterprise-FaxToPdf-v1"); // Set here what you want as applicationName // Google to HD private final String fileFinal = "DocumentFinal.pdf"; // File to read public FaxToPdfToGoogle() throws Exception { // --------------------------------------------- // -- Produce PDF From Fax -- // --------------------------------------------- File f = new File("./"+fileToExport); System.out.println("Doc exist : "+f.exists()); createPDFFromTiff(f); System.out.println("Doc finished"); // --------------------------------------------- // -- Upload PDF To Google OCR API -- // --------------------------------------------- System.out.println("Doc upload start"); // if(!document.isOpen()){ client.setUserCredentials(googleUsername, googlePassword); DocumentListEntry uploadedEntry = uploadFileGApi("./"+fileToExport, fileNameInGoogle); final String docId = uploadedEntry.getDocId(); System.out.println("Document now online @ :" + docId); System.out.println("Doc upload finished "); // } // --------------------------------------------- // -- Download file converted doc from Google -- // --------------------------------------------- System.out.println("Doc download start"); if(downloadFileGApi("./"+fileFinal, docId)) System.out.println("Doc download success"); else System.out.println("Doc download 
failed"); } private void createPDFFromTiff(File pdf) throws DocumentException, IOException { if(!pdf.exists()) System.out.println("Doc creation : "+pdf.createNewFile()); Document document = new Document(pageSize,0,0,0,0); PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(pdf)); System.out.println("Doc resolution : " + document.getPageSize().toString()); document.open(); PdfContentByte c = writer.getDirectContent(); document.add(new Paragraph("Multipages tiff file")); System.out.println("Read "+fileToRead); RandomAccessFileOrArray ra = new RandomAccessFileOrArray(fileToRead); int pages = TiffImage.getNumberOfPages(ra); System.out.println("Nombre de pages "+pages); for(int i = 1; i <= pages; i++){ Image img = TiffImage.getTiffImage(ra, i); img.setAbsolutePosition(0f,0f); img.scaleAbsolute(pageSize.getWidth(),pageSize.getHeight()); c.addImage(img); c.getPdfDocument().newPage(); } document.close(); } private DocumentListEntry uploadFileGApi(String filepath, String title)throws IOException, ServiceException{ File file = new File(filepath); System.out.println("Doc to upload exist " + file.exists()); DocumentListEntry newDocument = new DocumentListEntry(); String mimeType = DocumentListEntry.MediaType.fromFileName(file.getName()).getMimeType(); newDocument.setFile(file, mimeType); newDocument.setTitle(new PlainTextConstruct(title)); return client.insert(new URL("https://docs.google.com/feeds/default/private/full?ocr=true"), newDocument); } private boolean downloadFileGApi(String filepath, String resourceId) throws IOException, ServiceException{ URL url = new URL("https://docs.google.com/feeds/default/private/full/" + resourceId); DocumentListEntry entry = client.getEntry(url, DocumentListEntry.class); if( entry != null ){ MediaContent mc = (MediaContent) entry.getContent(); String fileExtension = mc.getMimeType().getSubType(); String exportUrl = mc.getUri(); // PDF file cannot be exported in different formats. 
String requestedExtension = filepath.substring(filepath.lastIndexOf(".") + 1); if (!requestedExtension.equals(fileExtension)) { System.err.println("Warning: " + mc.getMimeType().getMediaType() + " cannot be downloaded as a " + requestedExtension + ". Using ." + fileExtension + " instead."); filepath = filepath.substring(0, filepath.lastIndexOf(".") + 1) + fileExtension; } downloadFile(exportUrl, filepath); return true; } else return false; } public void downloadFile(String exportUrl, String filepath) throws IOException, MalformedURLException, ServiceException { System.out.println("Exporting document from: " + exportUrl); MediaContent mc = new MediaContent(); mc.setUri(exportUrl); MediaSource ms = client.getMedia(mc); InputStream inStream = null; FileOutputStream outStream = null; try { inStream = ms.getInputStream(); outStream = new FileOutputStream(filepath); int c; while ((c = inStream.read()) != -1) { outStream.write(c); } } finally { if (inStream != null) { inStream.close(); } if (outStream != null) { outStream.flush(); outStream.close(); } } } public static void main(String[] args) { try{ FaxToPdfToGoogle simpleImages = new FaxToPdfToGoogle(); }catch(Exception e){ System.out.println(e); } } }Voici la liste des api utilisées :
/FaxToPdf/lib/iText-5.0.4.jar
/FaxToPdf/lib/apache-mime4j-0.6.jar
/FaxToPdf/lib/commons-codec-1.3.jar
/FaxToPdf/lib/commons-logging-1.1.1.jar
/FaxToPdf/lib/httpclient-4.0.3.jar
/FaxToPdf/lib/httpcore-4.0.1.jar
/FaxToPdf/lib/httpcore-nio-4.0.1.jar
/FaxToPdf/lib/httpmime-4.0.3.jar
/FaxToPdf/lib/commons-httpclient-3.1.jar
/FaxToPdf/lib/gdata-core-1.0.jar
/FaxToPdf/lib/gdata-client-meta-1.0.jar
/FaxToPdf/lib/gdata-client-1.0.jar
/FaxToPdf/lib/gdata-media-1.0.jar
/FaxToPdf/lib/gdata-docs-meta-3.0.jar
/FaxToPdf/lib/gdata-docs-3.0.jar
/FaxToPdf/lib/guava-r07.jar
/FaxToPdf/lib/mail.jar