001/* Copyright (C) 2014 konik.io
002 *
003 * This file is part of the Konik library.
004 *
005 * The Konik library is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU Affero General Public License as
007 * published by the Free Software Foundation, either version 3 of the
008 * License, or (at your option) any later version.
009 *
010 * The Konik library is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU Affero General Public License for more details.
014 *
015 * You should have received a copy of the GNU Affero General Public License
016 * along with the Konik library. If not, see <http://www.gnu.org/licenses/>.
017 */
018package io.konik.carriage.itext;
019
020import static com.itextpdf.text.pdf.PdfName.EF;
021import static com.itextpdf.text.pdf.PdfName.F;
022import static com.itextpdf.text.pdf.PdfReader.getStreamBytes;
023import io.konik.harness.FileExtractor;
024import io.konik.harness.exception.InvoiceExtractionError;
025
026import java.io.ByteArrayInputStream;
027import java.io.IOException;
028import java.io.InputStream;
029
030import javax.inject.Named;
031import javax.inject.Singleton;
032
033import com.itextpdf.text.pdf.PRStream;
034import com.itextpdf.text.pdf.PdfArray;
035import com.itextpdf.text.pdf.PdfDictionary;
036import com.itextpdf.text.pdf.PdfName;
037import com.itextpdf.text.pdf.PdfReader;
038import com.itextpdf.text.pdf.PdfStream;
039
040/**
041 * The Class iText Pdf Invoice Extractor.
042 */
043@Named
044@Singleton
045public class ITextInvoiceExtractor implements FileExtractor {
046
047   private static final PdfName AF = new PdfName("AF");
048
049   @Override
050   public byte[] extract(InputStream pdfInput) {
051      
052      PdfReader reader = getPdfReader(pdfInput);
053      PdfArray af = getValidAf(reader.getCatalog());
054      PdfDictionary fileSpec = getValidFileSpec(af); 
055      PdfDictionary ef = getValidEf(fileSpec);
056      return getFStream(ef);
057   }
058   
059   @Override
060   public InputStream extractToStream(InputStream pdfInput) {
061     return new ByteArrayInputStream(extract(pdfInput));
062   }
063   
064
065   private static PdfReader getPdfReader(InputStream pdfStream) {
066      try {
067         return new PdfReader(pdfStream);
068      } catch (IOException e) {
069         throw new InvoiceExtractionError("Could not read or open pdf.",e);
070      }
071   }
072
073   private static PdfArray getValidAf(PdfDictionary catalog) {
074      if (catalog.contains(AF)) {
075         PdfArray af = catalog.getAsArray(AF);
076         if (!af.isEmpty() && af.getDirectObject(0).isDictionary()) {
077            return af;
078         }
079      }
080      throw new InvoiceExtractionError("Pdf catalog does not contain Valid AF Entry");
081   }
082   
083   private static PdfDictionary getValidFileSpec(PdfArray af) {
084      if (af.isEmpty() || af.getAsDict(0) == null) {
085         throw new InvoiceExtractionError("Pdf does not contain a FileSpec Entry");
086      }
087      return af.getAsDict(0);
088   }
089   
090   private static PdfDictionary getValidEf(PdfDictionary fileSpec) {
091      if (fileSpec.contains(EF)) {
092         return fileSpec.getAsDict(EF);
093      }
094      throw new InvoiceExtractionError("Pdf catalog does not contain Valid EF Entry");
095   }
096
097   private static byte[] getFStream(PdfDictionary ef){
098      if (ef.contains(F)) {
099         PdfStream xmlStream = ef.getAsStream(F);
100         try {
101            return getStreamBytes((PRStream) xmlStream);
102         } catch (IOException e) {
103            throw new InvoiceExtractionError("Could not extrac xml content form pdf.",e);
104         }
105      }
106      throw new InvoiceExtractionError("Pdf catalog does not contain Valid F Entry");
107   }
108
109
110}