com.lowagie.text.pdf
Class PdfReader

java.lang.Object
  extended by com.lowagie.text.pdf.PdfReader
Direct Known Subclasses:
FdfReader

public class PdfReader
extends Object

Reads a PDF document and prepares it to import pages to our document. This class is thread safe; this means that a single instance can serve as many output documents as needed and can even be static.

Author:
Paulo Soares (psoares@consiste.pt), Kazuya Ujihara

Field Summary
protected  PRAcroForm acroForm
           
protected  boolean acroFormParsed
           
protected  PdfDictionary catalog
           
protected  boolean consolidateNamedDestinations
           
protected  PdfEncryption decrypt
           
protected  boolean encrypted
           
(package private) static byte[] endobj
           
(package private) static byte[] endstream
           
protected  int eofPos
           
protected  int freeXref
           
protected  int lastXref
           
private  IntHashtable newHits
           
private  int objGen
           
private  int objNum
           
protected  ArrayList pageInh
           
(package private) static PdfName[] pageInhCandidates
           
protected  PRIndirectReference[] pageRefs
           
protected  PdfDictionary[] pages
           
protected  int pagesCount
           
protected  byte[] password
           
protected  char pdfVersion
           
protected  int pValue
           
protected  boolean rebuilt
           
protected  int rValue
           
protected  boolean sharedStreams
           
protected  ArrayList strings
           
protected  boolean tampered
           
protected  PRTokeniser tokens
           
protected  PdfDictionary trailer
           
private  boolean[] visited
           
protected  int[] xref
           
protected  PdfObject[] xrefObj
           
 
Constructor Summary
protected PdfReader()
           
  PdfReader(byte[] pdfIn)
          Reads and parses a PDF document.
  PdfReader(byte[] pdfIn, byte[] ownerPassword)
          Reads and parses a PDF document.
  PdfReader(PdfReader reader)
          Creates an independent duplicate.
  PdfReader(String filename)
          Reads and parses a PDF document.
  PdfReader(String filename, byte[] ownerPassword)
          Reads and parses a PDF document.
  PdfReader(URL url)
          Reads and parses a PDF document.
  PdfReader(URL url, byte[] ownerPassword)
          Reads and parses a PDF document.
 
Method Summary
static byte[] ASCII85Decode(byte[] in)
          Decodes a stream that has the ASCII85Decode filter.
static byte[] ASCIIHexDecode(byte[] in)
          Decodes a stream that has the ASCIIHexDecode filter.
 void consolidateNamedDestinations()
          Replaces all the local named links with the actual destinations.
 int createFakeFontSubsets()
          Finds all the fonts not subset but embedded and marks them as subset.
protected static PdfDictionary duplicatePdfDictionary(PdfDictionary original, PdfDictionary copy, PdfReader newReader)
           
protected static PdfObject duplicatePdfObject(PdfObject original, PdfReader newReader)
           
 void eliminateSharedStreams()
          Eliminates shared streams if they exist.
private  boolean equalsArray(byte[] ar1, byte[] ar2, int size)
           
(package private) static boolean equalsn(byte[] a1, byte[] a2)
           
(package private) static boolean existsName(PdfDictionary dic, PdfName key, PdfName value)
           
static byte[] FlateDecode(byte[] in)
          Decodes a stream that has the FlateDecode filter.
static byte[] FlateDecode(byte[] in, boolean strict)
          A helper to FlateDecode.
 AcroFields getAcroFields()
          Gets a read-only version of AcroFields.
 PRAcroForm getAcroForm()
          Returns the document's acroform, if it has one.
 PdfDictionary getCatalog()
          Returns the document's catalog.
 Rectangle getCropBox(int index)
          Gets the crop box without taking rotation into account.
(package private)  PdfEncryption getDecrypt()
           
 int getEofPos()
           
(package private) static String getFontName(PdfDictionary dic)
           
 HashMap getInfo()
          Returns the content of the document information dictionary as a HashMap of String.
 int getLastXref()
           
 byte[] getMetadata()
          Gets the XML metadata.
private static PdfArray getNameArray(PdfObject obj)
           
 HashMap getNamedDestination()
           
 HashMap getNamedDestinationFromNames()
           
 HashMap getNamedDestinationFromStrings()
           
static Rectangle getNormalizedRectangle(PdfArray box)
          Normalizes a Rectangle so that llx and lly are smaller than urx and ury.
 int getNumberOfPages()
          Gets the number of pages in the document.
 byte[] getPageContent(int pageNum, RandomAccessFileOrArray file)
          Gets the contents of the page.
 PdfDictionary getPageN(int pageNum)
          Gets the dictionary that represents a page.
 PRIndirectReference getPageOrigRef(int pageNum)
          Gets the page reference to this page.
 int getPageRotation(int index)
          Gets the page rotation.
 Rectangle getPageSize(int index)
          Gets the page size without taking rotation into account.
 Rectangle getPageSizeWithRotation(int index)
          Gets the page size, taking rotation into account.
static PdfObject getPdfObject(PdfObject obj)
          Reads a PdfObject resolving an indirect reference if needed.
protected  PdfReaderInstance getPdfReaderInstance(PdfWriter writer)
           
 char getPdfVersion()
           
 int getPermissions()
           
 RandomAccessFileOrArray getSafeFile()
          Gets a new file instance of the original PDF document.
static byte[] getStreamBytes(PRStream stream, RandomAccessFileOrArray file)
          Gets the content from a stream.
(package private) static String getSubsetPrefix(PdfDictionary dic)
           
 PdfDictionary getTrailer()
           
 int getXrefSize()
           
 boolean is128Key()
           
 boolean isEncrypted()
           
 boolean isRebuilt()
          Checks if the document had errors and was rebuilt.
 boolean isTampered()
          Checks if the document was changed.
private  void iterateBookmarks(PdfDictionary outline, HashMap names)
           
protected  void iteratePages(PdfDictionary page)
           
(package private) static PdfObject killIndirect(PdfObject obj)
           
protected  void killXref(PdfObject obj)
           
static byte[] LZWDecode(byte[] in)
          Decodes a stream that has the LZWDecode filter.
protected  void popPageAttributes()
           
protected  void PRSimpleRecursive(PdfObject obj)
           
protected  void pushPageAttributes(PdfDictionary nodePages)
           
protected  PdfArray readArray()
           
private  void readDecryptedDocObj()
           
protected  PdfDictionary readDictionary()
           
protected  void readDocObj()
           
protected  void readPages()
           
protected  void readPdf()
           
protected  PdfObject readPRObject()
           
protected  void readXref()
           
protected  void readXrefSection()
           
protected  void rebuildXref()
           
 void removeAnnotations()
          Removes all the annotations and fields from the document.
 void removeFields()
          Removes all the fields from the document.
protected  void removeUnusedNode(PdfObject obj, boolean[] hits)
           
 int removeUnusedObjects()
          Removes all the unreachable objects.
private static void replaceNamedDestination(PdfObject obj, HashMap names)
           
 void setPageContent(int pageNum, byte[] content)
          Sets the contents of the page.
 void setTampered(boolean tampered)
           
 int shuffleSubsetNames()
          Finds all the font subsets and changes the prefixes to some random values.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

pageInhCandidates

static final PdfName[] pageInhCandidates

endstream

static final byte[] endstream

endobj

static final byte[] endobj

tokens

protected PRTokeniser tokens

xref

protected int[] xref

xrefObj

protected PdfObject[] xrefObj

trailer

protected PdfDictionary trailer

pages

protected PdfDictionary[] pages

catalog

protected PdfDictionary catalog

pageRefs

protected PRIndirectReference[] pageRefs

acroForm

protected PRAcroForm acroForm

acroFormParsed

protected boolean acroFormParsed

pageInh

protected ArrayList pageInh

pagesCount

protected int pagesCount

encrypted

protected boolean encrypted

rebuilt

protected boolean rebuilt

freeXref

protected int freeXref

tampered

protected boolean tampered

lastXref

protected int lastXref

eofPos

protected int eofPos

pdfVersion

protected char pdfVersion

decrypt

protected PdfEncryption decrypt

password

protected byte[] password

strings

protected ArrayList strings

sharedStreams

protected boolean sharedStreams

consolidateNamedDestinations

protected boolean consolidateNamedDestinations

rValue

protected int rValue

pValue

protected int pValue

objNum

private int objNum

objGen

private int objGen

visited

private boolean[] visited

newHits

private IntHashtable newHits
Constructor Detail

PdfReader

protected PdfReader()

PdfReader

public PdfReader(String filename)
          throws IOException
Reads and parses a PDF document.

Parameters:
filename - the file name of the document
Throws:
IOException - on error

PdfReader

public PdfReader(String filename,
                 byte[] ownerPassword)
          throws IOException
Reads and parses a PDF document.

Parameters:
filename - the file name of the document
ownerPassword - the password to read the document
Throws:
IOException - on error

PdfReader

public PdfReader(byte[] pdfIn)
          throws IOException
Reads and parses a PDF document.

Parameters:
pdfIn - the byte array with the document
Throws:
IOException - on error

PdfReader

public PdfReader(byte[] pdfIn,
                 byte[] ownerPassword)
          throws IOException
Reads and parses a PDF document.

Parameters:
pdfIn - the byte array with the document
ownerPassword - the password to read the document
Throws:
IOException - on error

PdfReader

public PdfReader(URL url)
          throws IOException
Reads and parses a PDF document.

Parameters:
url - the URL of the document
Throws:
IOException - on error

PdfReader

public PdfReader(URL url,
                 byte[] ownerPassword)
          throws IOException
Reads and parses a PDF document.

Parameters:
url - the URL of the document
ownerPassword - the password to read the document
Throws:
IOException - on error

PdfReader

public PdfReader(PdfReader reader)
Creates an independent duplicate.

Parameters:
reader - the PdfReader to duplicate
Method Detail

getSafeFile

public RandomAccessFileOrArray getSafeFile()
Gets a new file instance of the original PDF document.

Returns:
a new file instance of the original PDF document

getPdfReaderInstance

protected PdfReaderInstance getPdfReaderInstance(PdfWriter writer)

getNumberOfPages

public int getNumberOfPages()
Gets the number of pages in the document.

Returns:
the number of pages in the document

getCatalog

public PdfDictionary getCatalog()
Returns the document's catalog. This dictionary is not a copy; any changes will be reflected in the catalog.

Returns:
the document's catalog

getAcroForm

public PRAcroForm getAcroForm()
Returns the document's acroform, if it has one.

Returns:
the document's acroform

getPageRotation

public int getPageRotation(int index)
Gets the page rotation. This value can be 0, 90, 180 or 270.

Parameters:
index - the page number. The first page is 1
Returns:
the page rotation

getPageSizeWithRotation

public Rectangle getPageSizeWithRotation(int index)
Gets the page size, taking rotation into account. This is a Rectangle with the value of the /MediaBox and the /Rotate key.

Parameters:
index - the page number. The first page is 1
Returns:
a Rectangle

getPageSize

public Rectangle getPageSize(int index)
Gets the page size without taking rotation into account. This is the value of the /MediaBox key.

Parameters:
index - the page number. The first page is 1
Returns:
the page size

getCropBox

public Rectangle getCropBox(int index)
Gets the crop box without taking rotation into account. This is the value of the /CropBox key. The crop box is the part of the document to be displayed or printed. It usually is the same as the media box but may be smaller.

Parameters:
index - the page number. The first page is 1
Returns:
the crop box

getInfo

public HashMap getInfo()
Returns the content of the document information dictionary as a HashMap of String.

Returns:
content of the document information dictionary

getNormalizedRectangle

public static Rectangle getNormalizedRectangle(PdfArray box)
Normalizes a Rectangle so that llx and lly are smaller than urx and ury.

Parameters:
box - the original rectangle
Returns:
a normalized Rectangle

readPdf

protected void readPdf()
                throws IOException
Throws:
IOException

equalsArray

private boolean equalsArray(byte[] ar1,
                            byte[] ar2,
                            int size)

readDecryptedDocObj

private void readDecryptedDocObj()
                          throws IOException
Throws:
IOException

getPdfObject

public static PdfObject getPdfObject(PdfObject obj)
Reads a PdfObject resolving an indirect reference if needed.

Parameters:
obj - the PdfObject to read
Returns:
the resolved PdfObject

pushPageAttributes

protected void pushPageAttributes(PdfDictionary nodePages)

popPageAttributes

protected void popPageAttributes()

iteratePages

protected void iteratePages(PdfDictionary page)
                     throws IOException
Throws:
IOException

readPages

protected void readPages()
                  throws IOException
Throws:
IOException

PRSimpleRecursive

protected void PRSimpleRecursive(PdfObject obj)
                          throws IOException
Throws:
IOException

readDocObj

protected void readDocObj()
                   throws IOException
Throws:
IOException

killIndirect

static PdfObject killIndirect(PdfObject obj)

readXref

protected void readXref()
                 throws IOException
Throws:
IOException

readXrefSection

protected void readXrefSection()
                        throws IOException
Throws:
IOException

rebuildXref

protected void rebuildXref()
                    throws IOException
Throws:
IOException

readDictionary

protected PdfDictionary readDictionary()
                                throws IOException
Throws:
IOException

readArray

protected PdfArray readArray()
                      throws IOException
Throws:
IOException

readPRObject

protected PdfObject readPRObject()
                          throws IOException
Throws:
IOException

FlateDecode

public static byte[] FlateDecode(byte[] in)
Decodes a stream that has the FlateDecode filter.

Parameters:
in - the input data
Returns:
the decoded data

FlateDecode

public static byte[] FlateDecode(byte[] in,
                                 boolean strict)
A helper to FlateDecode.

Parameters:
in - the input data
strict - true to read a correct stream; false to try to read a corrupted stream
Returns:
the decoded data

ASCIIHexDecode

public static byte[] ASCIIHexDecode(byte[] in)
Decodes a stream that has the ASCIIHexDecode filter.

Parameters:
in - the input data
Returns:
the decoded data

ASCII85Decode

public static byte[] ASCII85Decode(byte[] in)
Decodes a stream that has the ASCII85Decode filter.

Parameters:
in - the input data
Returns:
the decoded data

LZWDecode

public static byte[] LZWDecode(byte[] in)
Decodes a stream that has the LZWDecode filter.

Parameters:
in - the input data
Returns:
the decoded data

isRebuilt

public boolean isRebuilt()
Checks if the document had errors and was rebuilt.

Returns:
true if rebuilt.

getPageN

public PdfDictionary getPageN(int pageNum)
Gets the dictionary that represents a page.

Parameters:
pageNum - the page number. 1 is the first
Returns:
the page dictionary

getPageOrigRef

public PRIndirectReference getPageOrigRef(int pageNum)
Gets the page reference to this page.

Parameters:
pageNum - the page number. 1 is the first
Returns:
the page reference

getPageContent

public byte[] getPageContent(int pageNum,
                             RandomAccessFileOrArray file)
                      throws IOException
Gets the contents of the page.

Parameters:
pageNum - the page number. 1 is the first
file - the location of the PDF document
Returns:
the content
Throws:
IOException - on error

killXref

protected void killXref(PdfObject obj)

setPageContent

public void setPageContent(int pageNum,
                           byte[] content)
                    throws IOException
Sets the contents of the page.

Parameters:
content - the new page content
pageNum - the page number. 1 is the first
Throws:
IOException - on error

getStreamBytes

public static byte[] getStreamBytes(PRStream stream,
                                    RandomAccessFileOrArray file)
                             throws IOException
Gets the content from a stream.

Parameters:
stream - the stream
file - the location where the stream is
Returns:
the stream content
Throws:
IOException - on error

eliminateSharedStreams

public void eliminateSharedStreams()
Eliminates shared streams if they exist.


isTampered

public boolean isTampered()
Checks if the document was changed.

Returns:
true if the document was changed, false otherwise

setTampered

public void setTampered(boolean tampered)

getMetadata

public byte[] getMetadata()
                   throws IOException
Gets the XML metadata.

Returns:
the XML metadata
Throws:
IOException - on error

getLastXref

public int getLastXref()

getXrefSize

public int getXrefSize()

getEofPos

public int getEofPos()

getPdfVersion

public char getPdfVersion()

isEncrypted

public boolean isEncrypted()

getPermissions

public int getPermissions()

is128Key

public boolean is128Key()

getTrailer

public PdfDictionary getTrailer()

getDecrypt

PdfEncryption getDecrypt()

equalsn

static boolean equalsn(byte[] a1,
                       byte[] a2)

existsName

static boolean existsName(PdfDictionary dic,
                          PdfName key,
                          PdfName value)

getFontName

static String getFontName(PdfDictionary dic)

getSubsetPrefix

static String getSubsetPrefix(PdfDictionary dic)

shuffleSubsetNames

public int shuffleSubsetNames()
Finds all the font subsets and changes the prefixes to some random values.

Returns:
the number of font subsets altered

createFakeFontSubsets

public int createFakeFontSubsets()
Finds all the fonts not subset but embedded and marks them as subset.

Returns:
the number of fonts altered

getNameArray

private static PdfArray getNameArray(PdfObject obj)

getNamedDestination

public HashMap getNamedDestination()

getNamedDestinationFromNames

public HashMap getNamedDestinationFromNames()

getNamedDestinationFromStrings

public HashMap getNamedDestinationFromStrings()

replaceNamedDestination

private static void replaceNamedDestination(PdfObject obj,
                                            HashMap names)

removeFields

public void removeFields()
Removes all the fields from the document.


removeAnnotations

public void removeAnnotations()
Removes all the annotations and fields from the document.


iterateBookmarks

private void iterateBookmarks(PdfDictionary outline,
                              HashMap names)

consolidateNamedDestinations

public void consolidateNamedDestinations()
Replaces all the local named links with the actual destinations.


duplicatePdfDictionary

protected static PdfDictionary duplicatePdfDictionary(PdfDictionary original,
                                                      PdfDictionary copy,
                                                      PdfReader newReader)

duplicatePdfObject

protected static PdfObject duplicatePdfObject(PdfObject original,
                                              PdfReader newReader)

removeUnusedNode

protected void removeUnusedNode(PdfObject obj,
                                boolean[] hits)

removeUnusedObjects

public int removeUnusedObjects()
Removes all the unreachable objects.

Returns:
the number of indirect objects removed

getAcroFields

public AcroFields getAcroFields()
Gets a read-only version of AcroFields.

Returns:
a read-only version of AcroFields