Rechercher

lundi 11 octobre 2010

Convertir un fax "image" en pdf "texte" via OCR

Pour un projet client, nous avons besoin de convertir des fichiers .fax à l'aide d'une technologie de type OCR, afin que ces fichiers puissent être lus et indexés par un serveur Google Search Appliance.

Après avoir passé en revue les API OCR existantes, nous en avons trouvé de très performantes, mais toutes sont payantes et assez onéreuses, d'autant plus que nous sommes encore en phase de test. J'ai donc réalisé un bout de code Java qui lit un fichier FAX ou TIFF et le transforme en PDF via l’excellente API iText.
Une fois ce PDF produit, je l'envoie vers le nuage grâce à la Google Documents List Data API v3.0 afin d'exploiter sa conversion par OCR.
Puis enfin je télécharge le document produit sur mon disque local.
En espérant que cela puisse aider !


/**
 * Converts a multi-page FAX/TIFF file into a PDF with iText, uploads that PDF
 * to Google Docs with the {@code ocr=true} upload flag, then downloads the
 * resulting document to the local disk.
 *
 * <p>The whole pipeline runs from the constructor; {@link #main(String[])}
 * simply instantiates the class. All file names and credentials are the
 * hard-coded fields below.
 */
public class FaxToPdfToGoogle {

    // ---- Fax to PDF ----
    /** Source FAX/TIFF file, read from the working directory. */
    private final String fileToRead = "Document.fax";
    /** Intermediate PDF built from the fax pages; this is the file uploaded to Google. */
    private final String fileToExport = "DocumentToUpload.pdf";
    /**
     * Page size of the intermediate PDF. Per the original author, a higher
     * resolution results in
     * {@code com.google.gdata.util.InvalidEntryException: Could not convert document.}
     */
    private final Rectangle pageSize = PageSize.A4;

    // ---- PDF to Google ----
    /** E-mail address of the Google account (must be filled in before running). */
    private final String googleUsername = "";
    /** Password of the Google account (must be filled in before running). */
    private final String googlePassword = "";
    /** Title given to the uploaded document in Google Docs. */
    private final String fileNameInGoogle = "DocumentTest-FaxToPdfOcr.pdf";
    /** GData client; the constructor argument is a free-form application name. */
    private final DocsService client = new DocsService("enterprise-FaxToPdf-v1");

    // ---- Google to local disk ----
    /** Local file the document is downloaded to. */
    private final String fileFinal = "DocumentFinal.pdf";

    /**
     * Runs the three steps in order: build the PDF, upload it, download the result.
     *
     * @throws Exception if PDF creation, authentication, upload or download fails
     */
    public FaxToPdfToGoogle() throws Exception {

        // ---------------------------------------------
        // --           Produce PDF From Fax          --
        // ---------------------------------------------

        File f = new File("./" + fileToExport);
        System.out.println("Doc exist : " + f.exists());
        createPDFFromTiff(f);
        System.out.println("Doc finished");

        // ---------------------------------------------
        // --      Upload PDF To Google OCR API       --
        // ---------------------------------------------

        System.out.println("Doc upload start");
        client.setUserCredentials(googleUsername, googlePassword);
        DocumentListEntry uploadedEntry = uploadFileGApi("./" + fileToExport, fileNameInGoogle);

        // NOTE(review): downloadFileGApi names its parameter "resourceId" while
        // getDocId() is passed here - confirm the feed URL accepts a bare doc id.
        final String docId = uploadedEntry.getDocId();
        System.out.println("Document now online @ :" + docId);
        System.out.println("Doc upload finished ");

        // ---------------------------------------------
        // -- Download file converted doc from Google --
        // ---------------------------------------------

        System.out.println("Doc download start");
        if (downloadFileGApi("./" + fileFinal, docId)) {
            System.out.println("Doc download success");
        } else {
            System.out.println("Doc download failed");
        }
    }

    /**
     * Writes every page of {@code fileToRead} as a full-page image into the given PDF.
     *
     * @param pdf destination PDF file; created if it does not exist yet
     * @throws DocumentException if iText cannot build the document
     * @throws IOException if the source or destination file cannot be accessed
     */
    private void createPDFFromTiff(File pdf) throws DocumentException, IOException {
        if (!pdf.exists()) {
            System.out.println("Doc creation : " + pdf.createNewFile());
        }

        Document document = new Document(pageSize, 0, 0, 0, 0);
        PdfWriter writer = PdfWriter.getInstance(document, new FileOutputStream(pdf));

        System.out.println("Doc resolution : " + document.getPageSize().toString());

        document.open();
        try {
            PdfContentByte c = writer.getDirectContent();
            document.add(new Paragraph("Multipages tiff file"));

            System.out.println("Read " + fileToRead);

            RandomAccessFileOrArray ra = new RandomAccessFileOrArray(fileToRead);
            try {
                int pages = TiffImage.getNumberOfPages(ra);

                System.out.println("Nombre de pages " + pages);

                // TIFF page numbers passed to getTiffImage start at 1.
                for (int i = 1; i <= pages; i++) {
                    Image img = TiffImage.getTiffImage(ra, i);
                    // Stretch each fax page over the whole PDF page.
                    img.setAbsolutePosition(0f, 0f);
                    img.scaleAbsolute(pageSize.getWidth(), pageSize.getHeight());
                    c.addImage(img);
                    c.getPdfDocument().newPage();
                }
            } finally {
                // Release the handle on the source file even if a page fails to decode.
                ra.close();
            }
        } finally {
            // Always finalize the PDF so the output stream is flushed and closed.
            document.close();
        }
    }

    /**
     * Uploads a local file to Google Docs with the {@code ocr=true} query parameter.
     *
     * @param filepath path of the local file to upload
     * @param title title of the document once uploaded
     * @return the entry returned by the service for the uploaded document
     * @throws IOException on transport failure
     * @throws ServiceException if the service rejects the request
     */
    private DocumentListEntry uploadFileGApi(String filepath, String title) throws IOException, ServiceException {
        File file = new File(filepath);
        System.out.println("Doc to upload exist " + file.exists());
        DocumentListEntry newDocument = new DocumentListEntry();
        String mimeType = DocumentListEntry.MediaType.fromFileName(file.getName()).getMimeType();
        newDocument.setFile(file, mimeType);
        newDocument.setTitle(new PlainTextConstruct(title));

        return client.insert(new URL("https://docs.google.com/feeds/default/private/full?ocr=true"), newDocument);
    }

    /**
     * Fetches the entry identified by {@code resourceId} and downloads its content.
     * If the extension of {@code filepath} differs from the entry's MIME subtype,
     * a warning is printed and the MIME subtype is used as the extension instead.
     *
     * @param filepath requested local destination path
     * @param resourceId identifier appended to the private full feed URL
     * @return {@code true} if an entry was found and downloaded, {@code false} otherwise
     * @throws IOException on transport or file failure
     * @throws ServiceException if the service rejects the request
     */
    private boolean downloadFileGApi(String filepath, String resourceId) throws IOException, ServiceException {

        URL url = new URL("https://docs.google.com/feeds/default/private/full/" + resourceId);
        DocumentListEntry entry = client.getEntry(url, DocumentListEntry.class);

        if (entry == null) {
            return false;
        }

        MediaContent mc = (MediaContent) entry.getContent();
        String fileExtension = mc.getMimeType().getSubType();
        String exportUrl = mc.getUri();

        // Only a dot located in the file name itself marks an extension; this
        // avoids mistaking the dot of "./name" for one when there is none.
        int dot = extensionDotIndex(filepath);
        String requestedExtension = dot >= 0 ? filepath.substring(dot + 1) : "";

        // PDF file cannot be exported in different formats.
        if (!requestedExtension.equals(fileExtension)) {
            System.err.println("Warning: " + mc.getMimeType().getMediaType() +
                    " cannot be downloaded as a " + requestedExtension + ". Using ." +
                    fileExtension + " instead.");
            String base = dot >= 0 ? filepath.substring(0, dot) : filepath;
            filepath = base + "." + fileExtension;
        }
        downloadFile(exportUrl, filepath);
        return true;
    }

    /** Returns the index of the extension dot in the last path segment, or -1 if there is none. */
    private static int extensionDotIndex(String path) {
        int dot = path.lastIndexOf('.');
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        return dot > separator ? dot : -1;
    }

    /**
     * Streams the media found at {@code exportUrl} into the local file {@code filepath}.
     *
     * @param exportUrl URL of the media to fetch through the authenticated client
     * @param filepath local destination file, overwritten if it exists
     * @throws IOException on transport or file failure
     * @throws MalformedURLException declared for backward compatibility with callers
     * @throws ServiceException if the service rejects the request
     */
    public void downloadFile(String exportUrl, String filepath) throws IOException, MalformedURLException, ServiceException {
        System.out.println("Exporting document from: " + exportUrl);

        MediaContent mc = new MediaContent();
        mc.setUri(exportUrl);
        MediaSource ms = client.getMedia(mc);

        InputStream inStream = null;
        FileOutputStream outStream = null;

        try {
            inStream = ms.getInputStream();
            outStream = new FileOutputStream(filepath);

            // Copy in chunks: one system call per byte is needlessly slow.
            byte[] buffer = new byte[8192];
            int read;
            while ((read = inStream.read(buffer)) != -1) {
                outStream.write(buffer, 0, read);
            }
            outStream.flush();
        } finally {
            // Nested finally so the output stream is closed even if closing the input fails.
            try {
                if (inStream != null) {
                    inStream.close();
                }
            } finally {
                if (outStream != null) {
                    outStream.close();
                }
            }
        }
    }

    /**
     * Entry point: runs the conversion pipeline and reports any failure with
     * its stack trace on stderr and a non-zero exit status.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            new FaxToPdfToGoogle();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}


Voici la liste des bibliothèques utilisées :

/FaxToPdf/lib/iText-5.0.4.jar
/FaxToPdf/lib/apache-mime4j-0.6.jar
/FaxToPdf/lib/commons-codec-1.3.jar
/FaxToPdf/lib/commons-logging-1.1.1.jar
/FaxToPdf/lib/httpclient-4.0.3.jar
/FaxToPdf/lib/httpcore-4.0.1.jar
/FaxToPdf/lib/httpcore-nio-4.0.1.jar
/FaxToPdf/lib/httpmime-4.0.3.jar
/FaxToPdf/lib/commons-httpclient-3.1.jar
/FaxToPdf/lib/gdata-core-1.0.jar
/FaxToPdf/lib/gdata-client-meta-1.0.jar
/FaxToPdf/lib/gdata-client-1.0.jar
/FaxToPdf/lib/gdata-media-1.0.jar
/FaxToPdf/lib/gdata-docs-meta-3.0.jar
/FaxToPdf/lib/gdata-docs-3.0.jar
/FaxToPdf/lib/guava-r07.jar
/FaxToPdf/lib/mail.jar

Aucun commentaire:

Enregistrer un commentaire