LeenO computo metrico con LibreOffice  3.22.0
Il software libero per la gestione di computi metrici e contabilità lavori.
pdf.py
Vai alla documentazione di questo file.
1 # -*- coding: utf-8 -*-
2 #
3 # vim: sw=4:expandtab:foldmethod=marker
4 #
5 # Copyright (c) 2006, Mathieu Fenniak
6 # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
7 #
8 # All rights reserved.
9 #
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
12 # met:
13 #
14 # * Redistributions of source code must retain the above copyright notice,
15 # this list of conditions and the following disclaimer.
16 # * Redistributions in binary form must reproduce the above copyright notice,
17 # this list of conditions and the following disclaimer in the documentation
18 # and/or other materials provided with the distribution.
19 # * The name of the author may not be used to endorse or promote products
20 # derived from this software without specific prior written permission.
21 #
22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
23 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 # POSSIBILITY OF SUCH DAMAGE.
33 
34 """
35 A pure-Python PDF library with an increasing number of capabilities.
36 See README for links to FAQ, documentation, homepage, etc.
37 """
38 
# Original author of the library.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

# Current maintainer of the library.
__maintainer__ = "Phaseit, Inc."
# NOTE(review): unlike the three names above, this one has no trailing double
# underscore; presumably ``__maintainer_email__`` was intended -- confirm that
# nothing relies on the current spelling before renaming it.
__maintainer_email = "PyPDF2@phaseit.net"
44 
45 import string
46 import math
47 import struct
48 import sys
49 import uuid
50 from sys import version_info
51 if version_info < ( 3, 0 ):
52  from cStringIO import StringIO
53 else:
54  from io import StringIO
55 
56 if version_info < ( 3, 0 ):
57  BytesIO = StringIO
58 else:
59  from io import BytesIO
60 
61 from . import filters
62 from . import utils
63 import warnings
64 import codecs
65 from .generic import *
66 from .utils import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
67 from .utils import isString, b_, u_, ord_, chr_, str_, formatWarning
68 
69 if version_info < ( 2, 4 ):
70  from sets import ImmutableSet as frozenset
71 
72 if version_info < ( 2, 5 ):
73  from md5 import md5
74 else:
75  from hashlib import md5
76 import uuid
77 
78 
79 class PdfFileWriter(object):
80  """
81  This class supports writing PDF files out, given pages produced by another
82  class (typically :class:`PdfFileReader<PdfFileReader>`).
83  """
def __init__(self):
    """
    Prepare an empty document.

    Registers an empty page tree and an info dictionary in the object
    table, and builds the catalog dictionary.  The catalog is only kept
    aside here (``_root_object``); ``_root`` stays ``None`` until the
    catalog is registered as an indirect object.
    """
    self._header = b_("%PDF-1.3")
    # Table of indirect objects; object number N is stored at index N - 1.
    self._objects = []

    # Page tree root: no kids yet, hence a count of zero.
    page_tree_entries = {
        NameObject("/Type"): NameObject("/Pages"),
        NameObject("/Count"): NumberObject(0),
        NameObject("/Kids"): ArrayObject(),
    }
    page_tree = DictionaryObject()
    page_tree.update(page_tree_entries)
    self._pages = self._addObject(page_tree)

    # Document information dictionary; the producer string is written as
    # UTF-16BE text preceded by its byte order mark.
    producer = codecs.BOM_UTF16_BE + u_("PyPDF2").encode('utf-16be')
    info_dict = DictionaryObject()
    info_dict.update({
        NameObject("/Producer"): createStringObject(producer)
    })
    self._info = self._addObject(info_dict)

    # Catalog pointing at the page tree.
    catalog = DictionaryObject()
    catalog.update({
        NameObject("/Type"): NameObject("/Catalog"),
        NameObject("/Pages"): self._pages,
    })
    self._root = None
    self._root_object = catalog
112 
def _addObject(self, obj):
    """
    Append *obj* to the object table and return an indirect reference to it.

    The reference carries the 1-based position of the object in the table
    and generation 0.
    """
    table = self._objects
    table.append(obj)
    object_number = len(table)
    return IndirectObject(object_number, 0, self)
116 
def getObject(self, ido):
    """
    Resolve an indirect reference that belongs to this writer.

    :param ido: reference exposing ``pdf`` and ``idnum`` attributes.
    :return: the object stored under that number.
    :raises ValueError: if the reference belongs to another document.
    """
    owner = ido.pdf
    if owner != self:
        raise ValueError("pdf must be self")
    # Object numbers are 1-based, the table is 0-based.
    slot = ido.idnum - 1
    return self._objects[slot]
121 
def _addPage(self, page, action):
    """
    Register *page* and hook it into the page tree.

    :param page: page dictionary; its ``/Type`` must equal ``/Page``.
    :param action: callable ``action(kids, page_ref)`` that decides where
        the new reference goes inside the ``/Kids`` array.
    """
    assert page["/Type"] == "/Page"
    page[NameObject("/Parent")] = self._pages
    page_ref = self._addObject(page)
    tree = self.getObject(self._pages)
    action(tree["/Kids"], page_ref)
    # Keep the page count in step with the kids array.
    new_count = tree["/Count"] + 1
    tree[NameObject("/Count")] = NumberObject(new_count)
129 
def addPage(self, page):
    """
    Append a page at the end of this PDF file.

    :param PageObject page: The page to append, usually obtained from a
        :class:`PdfFileReader<PdfFileReader>` instance.
    """
    # Appending to the kids array places the page last.
    place_last = list.append
    self._addPage(page, place_last)
139 
def insertPage(self, page, index=0):
    """
    Insert a page at a given position of this PDF file.

    :param PageObject page: The page to insert, usually obtained from a
        :class:`PdfFileReader<PdfFileReader>` instance.
    :param int index: Position at which the page will be inserted.
    """
    def place_at_index(kids, page_ref):
        # Same contract as list.insert: returns None.
        return kids.insert(index, page_ref)

    self._addPage(page, place_at_index)
150 
def getPage(self, pageNumber):
    """
    Return the page stored at a given position of the page tree.

    :param int pageNumber: Zero-based position inside the ``/Kids`` array.
    :return: the object the reference at that position resolves to.
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    # XXX: crude hack -- only the direct kids of the root node are consulted.
    kids = self.getObject(self._pages)["/Kids"]
    page_ref = kids[pageNumber]
    return page_ref.getObject()
163 
def getNumPages(self):
    """
    :return: the ``/Count`` entry of the page tree root, as an integer.
    :rtype: int
    """
    page_tree = self.getObject(self._pages)
    count = page_tree[NameObject("/Count")]
    return int(count)
171 
def addBlankPage(self, width=None, height=None):
    """
    Create a blank page, append it to this PDF file and return it.

    Both sizes are handed unchanged to ``PageObject.createBlankPage``,
    which is responsible for dealing with missing values.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :return: the newly appended page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    blank = PageObject.createBlankPage(self, width, height)
    self.addPage(blank)
    return blank
189 
def insertBlankPage(self, width=None, height=None, index=0):
    """
    Create a blank page, insert it at *index* and return it.

    When a dimension is missing and a page already exists at *index*, the
    size of that page is used for both dimensions.  Otherwise the values
    are handed as they are to ``PageObject.createBlankPage``.

    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default
        user space units.
    :param int index: Position to add the page.
    :return: the newly inserted page
    :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
    :raises PageSizeNotDefinedError: if width and height are not defined
        and previous page does not exist.
    """
    # ``and`` binds tighter than ``or``: without the explicit grouping a
    # missing width alone triggered the lookup below, even when no page
    # exists at ``index``.
    size_missing = width is None or height is None
    if size_missing and (self.getNumPages() - 1) >= index:
        oldpage = self.getPage(index)
        width = oldpage.mediaBox.getWidth()
        height = oldpage.mediaBox.getHeight()
    page = PageObject.createBlankPage(self, width, height)
    self.insertPage(page, index)
    return page
213 
def addJS(self, javascript):
    """
    Add Javascript which will launch upon opening this PDF.

    The script is stored in a ``/JavaScript`` action that becomes the
    catalog's ``/OpenAction`` and is also listed in a ``/JavaScript`` name
    dictionary installed as the catalog's ``/Names`` entry (any previous
    ``/Names`` value in the catalog is replaced).

    :param str javascript: Your Javascript.

    Example, launching the print window when the PDF is opened::

        output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
    """
    js = DictionaryObject()
    js.update({
        NameObject("/Type"): NameObject("/Action"),
        NameObject("/S"): NameObject("/JavaScript"),
        # The script must be a PDF *string* object.  It used to be wrapped
        # in a NameObject holding "(...)" text, which left the script
        # unescaped in the output.
        NameObject("/JS"): createStringObject(javascript)
    })
    js_indirect_object = self._addObject(js)

    # We need a name for parameterized javascript in the pdf file, but it can be anything.
    js_string_name = str(uuid.uuid4())

    js_name_tree = DictionaryObject()
    js_name_tree.update({
        NameObject("/JavaScript"): DictionaryObject({
            NameObject("/Names"): ArrayObject([createStringObject(js_string_name), js_indirect_object])
        })
    })
    self._addObject(js_name_tree)

    self._root_object.update({
        NameObject("/OpenAction"): js_indirect_object,
        NameObject("/Names"): js_name_tree
    })
246 
def addAttachment(self, fname, fdata):
    """
    Embed a file inside the PDF.

    Three structures are built: the stream carrying the data, the
    ``/Filespec`` dictionary that points at the stream, and the
    ``/EmbeddedFiles`` name dictionary that is installed as the catalog's
    ``/Names`` entry.

    :param str fname: The filename to display.
    :param str fdata: The data in the file.

    Reference:
    https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Section 7.11.3
    """
    # 1. The stream holding the file's data.  Sample:
    #        8 0 obj
    #        <<
    #         /Length 12
    #         /Type /EmbeddedFile
    #        >>
    #        stream
    #        Hello world!
    #        endstream
    #        endobj
    embedded_stream = DecodedStreamObject()
    embedded_stream.setData(fdata)
    embedded_stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile")
    })

    # 2. The file specification.  Sample:
    #        7 0 obj
    #        <<
    #         /Type /Filespec
    #         /F (hello.txt)
    #         /EF << /F 8 0 R >>
    #        >>
    stream_holder = DictionaryObject()
    stream_holder.update({ NameObject("/F"):embedded_stream })

    file_spec = DictionaryObject()
    file_spec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),  # Perhaps also try TextStringObject
        NameObject("/EF"): stream_holder
    })

    # 3. The catalog entry referring to the file specification.  Sample:
    #        1 0 obj
    #        <<
    #         /Type /Catalog
    #         /Outlines 2 0 R
    #         /Pages 3 0 R
    #         /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
    #        >>
    #        endobj
    name_pairs = DictionaryObject()
    name_pairs.update({
        NameObject("/Names"): ArrayObject([createStringObject(fname), file_spec])
    })

    names_entry = DictionaryObject()
    names_entry.update({
        NameObject("/EmbeddedFiles"): name_pairs
    })

    # Install it in the catalog.
    self._root_object.update({
        NameObject("/Names"): names_entry
    })
327 
def appendPagesFromReader(self, reader, after_page_append=None):
    """
    Append every page of *reader* to this writer, in order.

    :param reader: a PdfFileReader object whose pages are copied.
    :callback after_page_append (function): Invoked after each page has
        been appended, when it is callable. Callback signature:

        :param writer_pageref (PDF page reference): The page as returned
            by this writer's ``getPage`` for the position just filled.
    """
    # Both counts are taken once, before any page is added.
    source_count = reader.getNumPages()
    first_new_index = self.getNumPages()

    for offset in range(source_count):
        self.addPage(reader.getPage(offset))
        appended = self.getPage(first_new_index + offset)
        if callable(after_page_append):
            after_page_append(appended)
353 
def updatePageFormFieldValues(self, page, fields):
    '''
    Update the form field values for a given page from a fields dictionary.

    Every annotation of *page* whose ``/T`` entry equals a key of *fields*
    gets its ``/V`` entry set to the corresponding value.  A page without
    an ``/Annots`` entry is left untouched.

    :param page: Page reference from PDF writer where the annotations
        and field data will be updated.
    :param fields: a Python dictionary of field names (/T) and text
        values (/V)
    '''
    # Pages without annotations used to raise KeyError here.
    if '/Annots' not in page:
        return
    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        for field in fields:
            if writer_annot.get('/T') == field:
                writer_annot.update({
                    NameObject("/V"): TextStringObject(fields[field])
                })
372 
def cloneReaderDocumentRoot(self, reader):
    '''
    Use the document root of *reader* as this writer's catalog.

    The ``/Root`` entry of the reader's trailer is stored as it is; no
    copy is made here.

    :param reader: PdfFileReader whose document root should be used.
    '''
    trailer = reader.trailer
    self._root_object = trailer['/Root']
381 
def cloneDocumentFromReader(self, reader, after_page_append=None):
    '''
    Clone a document from a PDF file reader.

    Two steps, in this order: take over the reader's document root, then
    append all of the reader's pages.

    :param reader: PDF file reader instance from which the clone
        should be created.
    :callback after_page_append (function): Passed through unchanged to
        ``appendPagesFromReader``, which invokes it after each page is
        appended. Callback signature:

        :param writer_pageref (PDF page reference): Reference to the page just
            appended to the document.
    '''
    clone_root = self.cloneReaderDocumentRoot
    append_pages = self.appendPagesFromReader
    clone_root(reader)
    append_pages(reader, after_page_append)
397 
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
    """
    Encrypt this PDF file with the PDF Standard encryption handler.

    The method only prepares the encryption dictionary, the file
    identifier and the key; they are stored on the writer as
    ``_encrypt``, ``_ID`` and ``_encrypt_key``.

    :param str user_pwd: The "user password", which allows for opening
        and reading the PDF file with the restrictions provided.
    :param str owner_pwd: The "owner password", which allows for
        opening the PDF files without any restrictions. By default,
        the owner password is the same as the user password.
    :param bool use_128bit: flag as to whether to use 128bit
        encryption. When false, 40bit encryption will be used. By default,
        this flag is on.
    """
    import time, random
    if owner_pwd == None:
        owner_pwd = user_pwd

    # Algorithm version, handler revision and key length in bytes.
    if use_128bit:
        V, rev, keylen = 2, 3, int(128 / 8)
    else:
        V, rev, keylen = 1, 2, int(40 / 8)

    # permit everything:
    P = -1
    O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))

    # Two identifiers, hashed from the current time and a random number.
    ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
    ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
    self._ID = ArrayObject((ID_1, ID_2))

    if rev != 2:
        assert rev == 3
        U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1, False)
    else:
        U, key = _alg34(user_pwd, O, P, ID_1)

    # Entries of the encryption dictionary, in writing order; /Length is
    # only present for algorithm version 2.
    entries = [
        ("/Filter", NameObject("/Standard")),
        ("/V", NumberObject(V)),
    ]
    if V == 2:
        entries.append(("/Length", NumberObject(keylen * 8)))
    entries.extend([
        ("/R", NumberObject(rev)),
        ("/O", ByteStringObject(O)),
        ("/U", ByteStringObject(U)),
        ("/P", NumberObject(P)),
    ])

    encrypt = DictionaryObject()
    for entry_name, entry_value in entries:
        encrypt[NameObject(entry_name)] = entry_value

    self._encrypt = self._addObject(encrypt)
    self._encrypt_key = key
444 
def write(self, stream):
    """
    Writes the collection of pages added to this object out as a PDF file.

    Steps, in order: register the catalog as an indirect object if that
    has not happened yet, map pages that came from other documents to
    their new object numbers, sweep the object graph starting at the
    catalog, then emit the header, every object, the cross-reference
    table, the trailer and the end-of-file marker.

    :param stream: An object to write the file to. The object must support
        the write method and the tell method, similar to a file object.
    """
    # Only a warning: the write is still attempted on a text-mode stream.
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)
    # Flip to True to print the external reference map before the sweep.
    debug = False
    import struct

    # The catalog is registered lazily, on the first call.
    if not self._root:
        self._root = self._addObject(self._root_object)

    # Nested mapping: source document -> generation -> object number ->
    # reference into this writer.
    externalReferenceMap = {}

    # PDF objects sometimes have circular references to their /Page objects
    # inside their object tree (for example, annotations). Those will be
    # indirect references to objects that we've recreated in this PDF. To
    # address this problem, PageObject's store their original object
    # reference number, and we add it to the external reference map before
    # we sweep for indirect references. This forces self-page-referencing
    # trees to reference the correct new object location, rather than
    # copying in a new copy of the page object.
    for objIndex in range(len(self._objects)):
        obj = self._objects[objIndex]
        if isinstance(obj, PageObject) and obj.indirectRef != None:
            data = obj.indirectRef
            if data.pdf not in externalReferenceMap:
                externalReferenceMap[data.pdf] = {}
            if data.generation not in externalReferenceMap[data.pdf]:
                externalReferenceMap[data.pdf][data.generation] = {}
            externalReferenceMap[data.pdf][data.generation][data.idnum] = IndirectObject(objIndex + 1, 0, self)

    # self.stack holds the object numbers already visited by the sweep; it
    # only exists for the duration of the sweep.
    self.stack = []
    if debug: print(("ERM:", externalReferenceMap, "root:", self._root))
    self._sweepIndirectReferences(externalReferenceMap, self._root)
    del self.stack

    # Begin writing:
    object_positions = []
    stream.write(self._header + b_("\n"))
    for i in range(len(self._objects)):
        idnum = (i + 1)
        obj = self._objects[i]
        # Remember where the object starts, for the xref table below.
        object_positions.append(stream.tell())
        stream.write(b_(str(idnum) + " 0 obj\n"))
        key = None
        # The encryption dictionary itself is written without a key.
        if hasattr(self, "_encrypt") and idnum != self._encrypt.idnum:
            # Per-object key: the document key followed by the low three
            # bytes of the object number and two bytes of the generation
            # number (always 0 here), hashed with MD5 and truncated.
            pack1 = struct.pack("<i", i + 1)[:3]
            pack2 = struct.pack("<i", 0)[:2]
            key = self._encrypt_key + pack1 + pack2
            assert len(key) == (len(self._encrypt_key) + 5)
            md5_hash = md5(key).digest()
            key = md5_hash[:min(16, len(self._encrypt_key) + 5)]
        obj.writeToStream(stream, key)
        stream.write(b_("\nendobj\n"))

    # xref table
    xref_location = stream.tell()
    stream.write(b_("xref\n"))
    # One entry per object plus the leading free entry for object 0.
    stream.write(b_("0 %s\n" % (len(self._objects) + 1)))
    stream.write(b_("%010d %05d f \n" % (0, 65535)))
    for offset in object_positions:
        stream.write(b_("%010d %05d n \n" % (offset, 0)))

    # trailer
    stream.write(b_("trailer\n"))
    trailer = DictionaryObject()
    trailer.update({
        NameObject("/Size"): NumberObject(len(self._objects) + 1),
        NameObject("/Root"): self._root,
        NameObject("/Info"): self._info,
    })
    # _ID and _encrypt only exist after encrypt() has been called.
    if hasattr(self, "_ID"):
        trailer[NameObject("/ID")] = self._ID
    if hasattr(self, "_encrypt"):
        trailer[NameObject("/Encrypt")] = self._encrypt
    trailer.writeToStream(stream, None)

    # eof
    stream.write(b_("\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
528 
def addMetadata(self, infos):
    """
    Add custom metadata to the output.

    Each key is turned into a PDF name and each value into a PDF string
    before the info dictionary is updated.

    :param dict infos: a Python dictionary where each key is a field
        and each value is your new metadata.
    """
    converted = {}
    for field, text in list(infos.items()):
        pdf_text = createStringObject(text)
        converted[NameObject(field)] = pdf_text
    info_dict = self.getObject(self._info)
    info_dict.update(converted)
540 
def _sweepIndirectReferences(self, externMap, data):
    """
    Recursively walk *data* and make every reference point into this writer.

    Dictionaries and arrays are rewritten in place.  References that
    already belong to this writer are followed once (``self.stack``
    records the object numbers already visited).  References into other
    documents are resolved, copied into this writer's object table and
    recorded in *externMap* so that later occurrences reuse the copy.

    :param externMap: nested mapping, source document -> generation ->
        object number -> reference into this writer; updated in place.
    :param data: the object to process.
    :return: the value to store in place of *data*: *data* itself, a
        reference into this writer, or a ``NullObject`` when a foreign
        reference cannot be resolved.
    """
    # Flip to True to print every visited object.
    debug = False
    if debug: print((data, "TYPE", data.__class__.__name__))
    if isinstance(data, DictionaryObject):
        # Iterate over a snapshot: the dictionary is modified in the loop.
        for key, value in list(data.items()):
            origvalue = value  # not used afterwards
            value = self._sweepIndirectReferences(externMap, value)
            if isinstance(value, StreamObject):
                # a dictionary value is a stream. streams must be indirect
                # objects, so we need to change this value.
                value = self._addObject(value)
            data[key] = value
        return data
    elif isinstance(data, ArrayObject):
        for i in range(len(data)):
            value = self._sweepIndirectReferences(externMap, data[i])
            if isinstance(value, StreamObject):
                # an array value is a stream. streams must be indirect
                # objects, so we need to change this value
                value = self._addObject(value)
            data[i] = value
        return data
    elif isinstance(data, IndirectObject):
        # internal indirect references are fine
        if data.pdf == self:
            if data.idnum in self.stack:
                # Already visited: stop here, which also breaks cycles.
                return data
            else:
                self.stack.append(data.idnum)
                realdata = self.getObject(data)
                self._sweepIndirectReferences(externMap, realdata)
                return data
        else:
            # Reference into another document: reuse an earlier copy if
            # there is one.
            newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum, None)
            if newobj == None:
                try:
                    newobj = data.pdf.getObject(data)
                    # Reserve the slot and publish the mapping *before*
                    # recursing, so references back to this object made
                    # from inside it find the mapping instead of copying
                    # the object again.
                    self._objects.append(None) # placeholder
                    idnum = len(self._objects)
                    newobj_ido = IndirectObject(idnum, 0, self)
                    if data.pdf not in externMap:
                        externMap[data.pdf] = {}
                    if data.generation not in externMap[data.pdf]:
                        externMap[data.pdf][data.generation] = {}
                    externMap[data.pdf][data.generation][data.idnum] = newobj_ido
                    newobj = self._sweepIndirectReferences(externMap, newobj)
                    self._objects[idnum-1] = newobj
                    return newobj_ido
                except ValueError:
                    # Unable to resolve the Object, returning NullObject instead.
                    return NullObject()
            return newobj
    else:
        # Anything else is returned unchanged.
        return data
595 
def getReference(self, obj):
    """
    Return an indirect reference to an object already held by this writer.

    :param obj: object to look up in the object table.
    :return: reference carrying the object's 1-based position and
        generation 0.
    :raises ValueError: if *obj* is not in the object table.
    """
    position = self._objects.index(obj)
    ref = IndirectObject(position + 1, 0, self)
    # Sanity check: the reference must resolve back to the object.
    assert ref.getObject() == obj
    return ref
601 
def getOutlineRoot(self):
    """
    Return the outline root of the document, creating it when missing.

    When the catalog has no ``/Outlines`` entry, an empty tree object is
    registered and linked from the catalog.  Otherwise the existing
    outline is checked to be present in the object table.

    :return: the outline object itself (not a reference to it).
    """
    catalog = self._root_object
    if '/Outlines' not in catalog:
        outline = TreeObject()
        outline.update({ })
        catalog[NameObject('/Outlines')] = self._addObject(outline)
        return outline

    outline = catalog['/Outlines']
    # Consistency check only: the reference built here is not returned.
    position = self._objects.index(outline) + 1
    outline_ref = IndirectObject(position, 0, self)
    assert outline_ref.getObject() == outline
    return outline
615 
def getNamedDestRoot(self):
    """
    Return the ``/Names`` array of the named destinations, creating the
    ``/Names`` -> ``/Dests`` -> ``/Names`` chain in the catalog wherever a
    link of it is missing.

    Existing dictionaries are checked to be present in the object table;
    newly created ones are registered and linked by reference.

    :return: the array holding the named destination entries.
    """
    catalog = self._root_object

    # Level 1: the catalog's /Names dictionary.
    has_names = '/Names' in catalog and isinstance(catalog['/Names'], DictionaryObject)
    if has_names:
        names = catalog['/Names']
        names_ref = IndirectObject(self._objects.index(names) + 1, 0, self)
        assert names_ref.getObject() == names
    else:
        names = DictionaryObject()
        catalog[NameObject('/Names')] = self._addObject(names)

    # Level 2: the /Dests dictionary.  A freshly created /Names dictionary
    # cannot contain one yet.
    if has_names and '/Dests' in names and isinstance(names['/Dests'], DictionaryObject):
        dests = names['/Dests']
        dests_ref = IndirectObject(self._objects.index(dests) + 1, 0, self)
        assert dests_ref.getObject() == dests
        if '/Names' in dests:
            return dests['/Names']
    else:
        dests = DictionaryObject()
        names[NameObject('/Dests')] = self._addObject(dests)

    # Level 3: no array yet, start an empty one.
    nd = ArrayObject()
    dests[NameObject('/Names')] = nd
    return nd
650 
def addBookmarkDestination(self, dest, parent=None):
    """
    Register *dest* and add it as a child of a bookmark node.

    :param dest: destination object to register.
    :param parent: node that receives the new child; the outline root is
        used when omitted.
    :return: the reference to the registered destination.
    """
    dest_ref = self._addObject(dest)

    # The outline root is fetched even when a parent is supplied.
    outline_root = self.getOutlineRoot()
    holder = outline_root if parent == None else parent

    holder.getObject().addChild(dest_ref, self)
    return dest_ref
664 
def addBookmarkDict(self, bookmark, parent=None):
    """
    Build a bookmark node from a dictionary and attach it to the outline.

    :param bookmark: mapping with the bookmark entries; an ``/A`` entry,
        when present, is copied into a separately registered action
        dictionary.
    :param parent: node that receives the new child; the outline root is
        used when omitted.
    :return: the reference to the registered bookmark node.
    """
    node = TreeObject()
    for entry_key, entry_value in list(bookmark.items()):
        node[NameObject(str(entry_key))] = entry_value
    node.update(bookmark)

    if '/A' in bookmark:
        action_dict = DictionaryObject()
        for entry_key, entry_value in list(bookmark['/A'].items()):
            action_dict[NameObject(str(entry_key))] = entry_value
        # The node refers to the action indirectly.
        node[NameObject('/A')] = self._addObject(action_dict)

    node_ref = self._addObject(node)

    # The outline root is fetched even when a parent is supplied.
    outline_root = self.getOutlineRoot()
    holder = outline_root if parent == None else parent

    holder.getObject().addChild(node_ref, self)
    return node_ref
689 
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
    """
    Add a bookmark to this PDF file.

    The bookmark carries a ``/GoTo`` action whose destination is the page
    at position *pagenum* of the page tree.

    :param str title: Title to use for this bookmark.
    :param int pagenum: Page number this bookmark will point to.
    :param parent: A reference to a parent bookmark to create nested
        bookmarks.
    :param tuple color: Color of the bookmark as a red, green, blue tuple
        from 0.0 to 1.0
    :param bool bold: Bookmark is bold
    :param bool italic: Bookmark is italic
    :param str fit: The fit of the destination page. See
        :meth:`addLink()<addLink>` for details.
    :param args: extra values for the chosen fit; ``None`` is written as
        a null object.
    :return: the reference to the registered bookmark.
    """
    page_ref = self.getObject(self._pages)['/Kids'][pagenum]
    goto_action = DictionaryObject()

    zoom_args = [NullObject() if a is None else NumberObject(a) for a in args]
    destination = Destination(NameObject("/"+title + " bookmark"), page_ref, NameObject(fit), *zoom_args)
    dest_array = destination.getDestArray()
    goto_action.update({
        NameObject('/D') : dest_array,
        NameObject('/S') : NameObject('/GoTo')
    })
    action_ref = self._addObject(goto_action)

    # The outline root is fetched even when a parent is supplied.
    outline_root = self.getOutlineRoot()
    holder = outline_root if parent == None else parent

    node = TreeObject()
    node.update({
        NameObject('/A'): action_ref,
        NameObject('/Title'): createStringObject(title),
    })

    if color is not None:
        rgb = ArrayObject([FloatObject(c) for c in color])
        node.update({NameObject('/C'): rgb})

    # Style flags: 1 for italic, 2 for bold; written only when non-zero.
    style_flags = (1 if italic else 0) + (2 if bold else 0)
    if style_flags:
        node.update({NameObject('/F'): NumberObject(style_flags)})

    node_ref = self._addObject(node)

    holder.getObject().addChild(node_ref, self)
    return node_ref
750 
def addNamedDestinationObject(self, dest):
    """
    Register *dest* and list it among the named destinations.

    The value of the destination's ``/Title`` entry and the new reference
    are appended, as a pair, to the named destination array.

    :param dest: destination object exposing a ``/Title`` entry.
    :return: the reference to the registered destination.
    """
    dest_ref = self._addObject(dest)

    name_array = self.getNamedDestRoot()
    pair = [dest['/Title'], dest_ref]
    name_array.extend(pair)

    return dest_ref
758 
def addNamedDestination(self, title, pagenum):
    """
    Create a named destination pointing at a page of this document.

    A ``/GoTo`` action targeting the page at position *pagenum* with a
    ``/FitH`` fit is registered, then listed under *title* in the named
    destination array.

    :param str title: Name under which the destination is listed.
    :param int pagenum: Zero-based position of the target page in the
        page tree.
    :return: the reference to the registered action dictionary.
    """
    pageRef = self.getObject(self._pages)['/Kids'][pagenum]
    dest = DictionaryObject()
    dest.update({
        # NOTE(review): 826 is the hard-coded vertical coordinate passed
        # to /FitH -- presumably the top of a page of a particular
        # size; confirm.
        NameObject('/D') : ArrayObject([pageRef, NameObject('/FitH'), NumberObject(826)]),
        NameObject('/S') : NameObject('/GoTo')
    })

    destRef = self._addObject(dest)
    nd = self.getNamedDestRoot()

    # Array members are serialised through their own writeToStream(); a
    # plain Python string has none, so the title has to be wrapped in a
    # PDF string object before it is stored.
    nd.extend([createStringObject(title), destRef])

    return destRef
773 
def removeLinks(self):
    """
    Removes links and annotations from this output.

    The ``/Annots`` entry is deleted from every page listed directly in
    the ``/Kids`` array of the page tree root.
    """
    kids = self.getObject(self._pages)['/Kids']
    for kid in kids:
        page_dict = self.getObject(kid)
        if "/Annots" not in page_dict:
            continue
        del page_dict['/Annots']
783 
def removeImages(self, ignoreByteStringObject=False):
    """
    Removes images from this output.

    For every page, the content stream is rebuilt without a fixed list of
    graphics operators (including ``Do``) found after a ``q`` operator
    and before the next ``Q``, and without any ``re`` operator.

    :param bool ignoreByteStringObject: when true, operands of the text
        showing operators (``Tj``, ``'``, ``"``, ``TJ``) that are not
        ``TextStringObject`` instances are replaced with an empty
        ``TextStringObject``.
    """
    pages = self.getObject(self._pages)['/Kids']
    for j in range(len(pages)):
        page = pages[j]
        pageRef = self.getObject(page)
        content = pageRef['/Contents'].getObject()
        # Parse the content into operations unless that is already done.
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pageRef)

        # Operations that are kept, in their original order.
        _operations = []
        # True after a 'q' operator, until the next 'Q'.
        seq_graphics = False
        for operands, operator in content.operations:
            # Text showing operators: the string operand is at position 0
            # for Tj and ', at position 2 for ", and TJ takes an array.
            if operator == b_('Tj'):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_("'"):
                text = operands[0]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[0] = TextStringObject()
            elif operator == b_('"'):
                text = operands[2]
                if ignoreByteStringObject:
                    if not isinstance(text, TextStringObject):
                        operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                for i in range(len(operands[0])):
                    if ignoreByteStringObject:
                        if not isinstance(operands[0][i], TextStringObject):
                            operands[0][i] = TextStringObject()

            # The 'q' and 'Q' operators themselves are not in the list
            # below, so both are kept.
            if operator == b_('q'):
                seq_graphics = True
            if operator == b_('Q'):
                seq_graphics = False
            if seq_graphics:
                # Dropped while inside a q ... Q section.
                if operator in [b_('cm'), b_('w'), b_('J'), b_('j'), b_('M'), b_('d'), b_('ri'), b_('i'),
                        b_('gs'), b_('W'), b_('b'), b_('s'), b_('S'), b_('f'), b_('F'), b_('n'), b_('m'), b_('l'),
                        b_('c'), b_('v'), b_('y'), b_('h'), b_('B'), b_('Do'), b_('sh')]:
                    continue
            # Dropped everywhere.
            if operator == b_('re'):
                continue
            _operations.append((operands, operator))

        content.operations = _operations
        pageRef.__setitem__(NameObject('/Contents'), content)
838 
def removeText(self, ignoreByteStringObject=False):
    """
    Removes text from this output.

    For every page, the string operands of the text showing operators
    (``Tj``, ``'``, ``"`` and ``TJ``) are replaced with an empty
    ``TextStringObject``.

    :param bool ignoreByteStringObject: when false, only operands that
        are ``TextStringObject`` instances are blanked; when true,
        ``ByteStringObject`` operands are blanked as well.
    """
    # Operand classes that get blanked out.
    if ignoreByteStringObject:
        blankable = (TextStringObject, ByteStringObject)
    else:
        blankable = (TextStringObject,)

    kids = self.getObject(self._pages)['/Kids']
    for kid in kids:
        page_dict = self.getObject(kid)
        content = page_dict['/Contents'].getObject()
        # Parse the content into operations unless that is already done.
        if not isinstance(content, ContentStream):
            content = ContentStream(content, page_dict)

        for operands, operator in content.operations:
            if operator == b_('Tj') or operator == b_("'"):
                # The string is the first operand.
                if isinstance(operands[0], blankable):
                    operands[0] = TextStringObject()
            elif operator == b_('"'):
                # The string is the third operand.
                if isinstance(operands[2], blankable):
                    operands[2] = TextStringObject()
            elif operator == b_("TJ"):
                # The first operand is an array; blank its string members.
                pieces = operands[0]
                for position, piece in enumerate(pieces):
                    if isinstance(piece, blankable):
                        pieces[position] = TextStringObject()

        page_dict[NameObject('/Contents')] = content
892 
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
    """
    Add an internal link from a rectangular area to the specified page.

    :param int pagenum: index of the page on which to place the link.
    :param int pagedest: index of the page to which the link should go.
    :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
        integers specifying the clickable rectangular area
        ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
    :param border: if provided, an array describing border-drawing
        properties. See the PDF spec for details. No border will be
        drawn if this argument is omitted.
    :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
        to be supplied. Passing ``None`` will be read as a null value for that coordinate.

    Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
         /Fit       No additional arguments
         /XYZ       [left] [top] [zoomFactor]
         /FitH      [top]
         /FitV      [left]
         /FitR      [left] [bottom] [right] [top]
         /FitB      No additional arguments
         /FitBH     [top]
         /FitBV     [left]
    """
    kids = self.getObject(self._pages)['/Kids']
    link_page_ref = kids[pagenum]
    # TODO: switch for external link
    target_page_ref = self.getObject(self._pages)['/Kids'][pagedest]
    link_page = self.getObject(link_page_ref)

    # Border: the first three entries, plus a dash pattern array when a
    # fourth entry is supplied; three zeros when no border is given.
    if border is None:
        border_entries = [NumberObject(0)] * 3
    else:
        border_entries = [NameObject(n) for n in border[:3]]
        if len(border) == 4:
            border_entries.append(ArrayObject([NameObject(n) for n in border[3]]))

    # Rectangle: strings are written as they are, rectangle objects are
    # kept, anything else is converted.
    if isString(rect):
        rect = NameObject(rect)
    elif not isinstance(rect, RectangleObject):
        rect = RectangleObject(rect)

    zoom_args = [NullObject() if a is None else NumberObject(a) for a in args]
    # TODO: create a better name for the link
    destination = Destination(NameObject("/LinkName"), target_page_ref, NameObject(fit), *zoom_args)
    dest_array = destination.getDestArray()

    annotation = DictionaryObject()
    annotation.update({
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Subtype'): NameObject('/Link'),
        NameObject('/P'): link_page_ref,
        NameObject('/Rect'): rect,
        NameObject('/Border'): ArrayObject(border_entries),
        NameObject('/Dest'): dest_array
    })
    annotation_ref = self._addObject(annotation)

    # Attach the annotation to the page that shows the link.
    if "/Annots" not in link_page:
        link_page[NameObject('/Annots')] = ArrayObject([annotation_ref])
    else:
        link_page['/Annots'].append(annotation_ref)
962 
963  _valid_layouts = ['/NoLayout', '/SinglePage', '/OneColumn', '/TwoColumnLeft', '/TwoColumnRight', '/TwoPageLeft', '/TwoPageRight']
964 
def getPageLayout(self):
    """
    Get the page layout.

    Reads the ``/PageLayout`` entry of the writer's root object.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: str, None if not specified
    """
    try:
        layout = self._root_object['/PageLayout']
    except KeyError:
        # The root object carries no /PageLayout entry.
        layout = None
    return layout
977 
def setPageLayout(self, layout):
    """
    Set the page layout.

    A plain string is checked against ``_valid_layouts`` (an unknown value
    only triggers a warning) and then wrapped in a ``NameObject``. A value
    that already is a ``NameObject`` is stored as given, without the check.

    :param str layout: The page layout to be used

    Valid layouts are:
         /NoLayout        Layout explicitly not specified
         /SinglePage      Show one page at a time
         /OneColumn       Show one column at a time
         /TwoColumnLeft   Show pages in two columns, odd-numbered pages on the left
         /TwoColumnRight  Show pages in two columns, odd-numbered pages on the right
         /TwoPageLeft     Show two pages at a time, odd-numbered pages on the left
         /TwoPageRight    Show two pages at a time, odd-numbered pages on the right
    """
    if isinstance(layout, NameObject):
        layout_name = layout
    else:
        if layout not in self._valid_layouts:
            warnings.warn("Layout should be one of: {}".format(', '.join(self._valid_layouts)))
        layout_name = NameObject(layout)
    self._root_object.update({NameObject('/PageLayout'): layout_name})
998 
pageLayout = property(fget=getPageLayout, fset=setPageLayout)
"""Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
1002 
1003  _valid_modes = ['/UseNone', '/UseOutlines', '/UseThumbs', '/FullScreen', '/UseOC', '/UseAttachments']
1004 
def getPageMode(self):
    """
    Get the page mode.

    Reads the ``/PageMode`` entry of the writer's root object.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
    of valid modes.

    :return: Page mode currently being used.
    :rtype: str, None if not specified
    """
    try:
        mode = self._root_object['/PageMode']
    except KeyError:
        # The root object carries no /PageMode entry.
        mode = None
    return mode
1018 
def setPageMode(self, mode):
    """
    Set the page mode.

    A plain string is checked against ``_valid_modes`` (an unknown value
    only triggers a warning) and then wrapped in a ``NameObject``. A value
    that already is a ``NameObject`` is stored as given, without the check.

    :param str mode: The page mode to use.

    Valid modes are:
        /UseNone         Do not show outlines or thumbnails panels
        /UseOutlines     Show outlines (aka bookmarks) panel
        /UseThumbs       Show page thumbnails panel
        /FullScreen      Fullscreen view
        /UseOC           Show Optional Content Group (OCG) panel
        /UseAttachments  Show attachments panel
    """
    if isinstance(mode, NameObject):
        mode_name = mode
    else:
        if mode not in self._valid_modes:
            warnings.warn("Mode should be one of: {}".format(', '.join(self._valid_modes)))
        mode_name = NameObject(mode)
    self._root_object.update({NameObject('/PageMode'): mode_name})
1038 
pageMode = property(fget=getPageMode, fset=setPageMode)
"""Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
1042 
1043 
1044 class PdfFileReader(object):
1045  """
1046  Initializes a PdfFileReader object. This operation can take some time, as
1047  the PDF stream's cross-reference tables are read into memory.
1048 
1049  :param stream: A File object or an object that supports the standard read
1050  and seek methods similar to a File object. Could also be a
1051  string representing a path to a PDF file.
1052  :param bool strict: Determines whether user should be warned of all
1053  problems and also causes some correctable problems to be fatal.
1054  Defaults to ``True``.
1055  :param warndest: Destination for logging warnings (defaults to
1056  ``sys.stderr``).
1057  :param bool overwriteWarnings: Determines whether to override Python's
1058  ``warnings.py`` module with a custom implementation (defaults to
1059  ``True``).
1060  """
def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
    """
    Set up reader state and parse *stream*.

    :param stream: a file-like object supporting ``read`` and ``seek``, or a
        string path; a path is opened in binary mode and loaded into a
        ``BytesIO`` buffer.
    :param bool strict: stored as ``self.strict``.
    :param warndest: object whose ``write`` method receives formatted
        warnings; ``sys.stderr`` is used when it is ``None``.
    :param bool overwriteWarnings: when true, ``warnings.showwarning`` is
        replaced process-wide by a handler writing to *warndest*.
    """
    if overwriteWarnings:
        # have to dynamically override the default showwarning since there are no
        # public methods that specify the 'file' parameter
        def _showwarning(message, category, filename, lineno, file=warndest, line=None):
            if file is None:
                file = sys.stderr
            try:
                file.write(formatWarning(message, category, filename, lineno, line))
            except IOError:
                # Deliberate best effort: a broken warning sink must not
                # abort PDF processing.
                pass
        warnings.showwarning = _showwarning
    self.strict = strict
    self.flattenedPages = None
    # Cache of resolved indirect objects keyed by (generation, idnum); it
    # must exist before read(), which stores objects in it.
    self.resolvedObjects = {}
    self.xrefIndex = 0
    self._pageId2Num = None # map page IndirectRef number to Page Number
    # getObject() consults this flag, so define it before parsing starts.
    self._override_encryption = False
    if hasattr(stream, 'mode') and 'b' not in stream.mode:
        warnings.warn("PdfFileReader stream/file object is not in binary mode. It may not be read correctly.", utils.PdfReadWarning)
    if isString(stream):
        # The context manager closes the file even if reading it fails.
        with open(stream, 'rb') as fileobj:
            stream = BytesIO(b_(fileobj.read()))
    self.read(stream)
    self.stream = stream
1088 
def getDocumentInfo(self):
    """
    Retrieves the PDF file's document information dictionary, if it exists.
    Note that some PDF files use metadata streams instead of docinfo
    dictionaries, and these metadata streams will not be accessed by this
    function.

    The trailer's ``/Info`` entry is copied into a fresh
    ``DocumentInformation`` instance.

    :return: the document information of this PDF file
    :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
    """
    trailer = self.trailer
    if "/Info" not in trailer:
        return None
    raw_info = trailer['/Info']
    info = DocumentInformation()
    info.update(raw_info)
    return info
1105 
documentInfo = property(fget=lambda self: self.getDocumentInfo())
"""Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""
1108 
def getXmpMetadata(self):
    """
    Retrieves XMP (Extensible Metadata Platform) data from the PDF document
    root.

    ``self._override_encryption`` is switched on for the duration of the
    lookup and always reset to ``False`` afterwards, even on error.

    :return: a :class:`XmpInformation<xmp.XmpInformation>`
        instance that can be used to access XMP metadata from the document.
    :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
        ``None`` if no metadata was found on the document root.
    """
    self._override_encryption = True
    try:
        root = self.trailer["/Root"]
        return root.getXmpMetadata()
    finally:
        self._override_encryption = False
1124 
xmpMetadata = property(fget=lambda self: self.getXmpMetadata())
"""
Read-only property that accesses the
:meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
"""
1130 
def getNumPages(self):
    """
    Calculates the number of pages in this PDF file.

    For an encrypted file a decryption with the empty password is tried and
    the ``/Count`` entry of the page tree root is returned. Otherwise the
    page tree is flattened (once) and the flattened pages are counted.

    :return: number of pages
    :rtype: int
    :raises PdfReadError: if file is encrypted and restrictions prevent
        this action.
    """

    # Flattened pages will not work on an Encrypted PDF;
    # the PDF file's page count is used in this case. Otherwise,
    # the original method (flattened page count) is used.
    if self.isEncrypted:
        self._override_encryption = True
        try:
            self.decrypt('')
            return self.trailer["/Root"]["/Pages"]["/Count"]
        except Exception:
            # Only ordinary errors are reported as a read error; a bare
            # "except" would also swallow KeyboardInterrupt / SystemExit.
            raise utils.PdfReadError("File has not been decrypted")
        finally:
            self._override_encryption = False

    if self.flattenedPages is None:
        self._flatten()
    return len(self.flattenedPages)
1157 
numPages = property(fget=lambda self: self.getNumPages())
"""
Read-only property that accesses the
:meth:`getNumPages()<PdfFileReader.getNumPages>` function.
"""
1163 
def getPage(self, pageNumber):
    """
    Retrieves a page by number from this PDF file.

    The page tree is flattened on first use; later calls index the cached
    list directly.

    :param int pageNumber: The page number to retrieve
        (pages begin at zero)
    :return: a :class:`PageObject<pdf.PageObject>` instance.
    :rtype: :class:`PageObject<pdf.PageObject>`
    """
    if self.flattenedPages is None:
        self._flatten()
    flattened = self.flattenedPages
    return flattened[pageNumber]
1178 
namedDestinations = property(fget=lambda self: self.getNamedDestinations())
"""
Read-only property that accesses the
:meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
"""
1185 
1186  # A select group of relevant field attributes. For the complete list,
1187  # see section 8.6.2 of the PDF 1.7 reference.
1188 
def getFields(self, tree = None, retval = None, fileobj = None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    On the initial call (``retval`` omitted) the ``/AcroForm`` entry of the
    document catalog is used as the tree, replacing any *tree* passed in.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {
        "/FT": "Field Type",
        "/Parent": "Parent",
        "/T": "Field Name",
        "/TU": "Alternate Field Name",
        "/TM": "Mapping Name",
        "/Ff": "Field Flags",
        "/V": "Value",
        "/DV": "Default Value",
    }
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]
    if tree is None:
        return retval

    self._checkKids(tree, retval, fileobj)
    # A node carrying any known field attribute is itself a field.
    if any(attr in tree for attr in fieldAttributes):
        self._buildField(tree, retval, fileobj, fieldAttributes)

    if "/Fields" in tree:
        for entry in tree["/Fields"]:
            self._buildField(entry.getObject(), retval, fileobj, fieldAttributes)

    return retval
1230 
1231  def _buildField(self, field, retval, fileobj, fieldAttributes):
1232  self._checkKids(field, retval, fileobj)
1233  try:
1234  key = field["/TM"]
1235  except KeyError:
1236  try:
1237  key = field["/T"]
1238  except KeyError:
1239  # Ignore no-name field for now
1240  return
1241  if fileobj:
1242  self._writeField(fileobj, field, fieldAttributes)
1243  fileobj.write("\n")
1244  retval[key] = Field(field)
1245 
1246  def _checkKids(self, tree, retval, fileobj):
1247  if "/Kids" in tree:
1248  # recurse down the tree
1249  for kid in tree["/Kids"]:
1250  self.getFields(kid.getObject(), retval, fileobj)
1251 
1252  def _writeField(self, fileobj, field, fieldAttributes):
1253  order = ["/TM", "/T", "/FT", "/Parent", "/TU", "/Ff", "/V", "/DV"]
1254  for attr in order:
1255  attrName = fieldAttributes[attr]
1256  try:
1257  if attr == "/FT":
1258  # Make the field type value more clear
1259  types = {"/Btn":"Button", "/Tx":"Text", "/Ch": "Choice",
1260  "/Sig":"Signature"}
1261  if field[attr] in types:
1262  fileobj.write(attrName + ": " + types[field[attr]] + "\n")
1263  elif attr == "/Parent":
1264  # Let's just write the name of the parent
1265  try:
1266  name = field["/Parent"]["/TM"]
1267  except KeyError:
1268  name = field["/Parent"]["/T"]
1269  fileobj.write(attrName + ": " + name + "\n")
1270  else:
1271  fileobj.write(attrName + ": " + str(field[attr]) + "\n")
1272  except KeyError:
1273  # Field attribute is N/A or unknown, so don't write anything
1274  pass
1275 
1277  ''' Retrieves form fields from the document with textual data (inputs, dropdowns)
1278  '''
1279  # Retrieve document form fields
1280  formfields = self.getFields()
1281  return dict(
1282  (formfields[field]['/T'], formfields[field].get('/V')) for field in formfields \
1283  if formfields[field].get('/FT') == '/Tx'
1284  )
1285 
def getNamedDestinations(self, tree=None, retval=None):
    """
    Retrieves the named destinations present in the document.

    On the initial call (``retval`` omitted) the name tree is taken from
    the catalog's ``/Dests`` entry or, failing that, from ``/Names/Dests``.
    *tree* and *retval* are otherwise used for the recursive descent.

    :return: a dictionary which maps names to
        :class:`Destinations<PyPDF2.generic.Destination>`.
    :rtype: dict
    """
    if retval is None:
        retval = {}
        catalog = self.trailer["/Root"]

        # get the name tree
        if "/Dests" in catalog:
            tree = catalog["/Dests"]
        elif "/Names" in catalog:
            catalog_names = catalog['/Names']
            if "/Dests" in catalog_names:
                tree = catalog_names['/Dests']

    if tree is None:
        return retval

    if "/Kids" in tree:
        # recurse down the tree
        for child in tree["/Kids"]:
            self.getNamedDestinations(child.getObject(), retval)

    if "/Names" in tree:
        # Flat sequence of alternating name / destination entries.
        entries = tree["/Names"]
        for index in range(0, len(entries), 2):
            key = entries[index].getObject()
            target = entries[index + 1].getObject()
            if isinstance(target, DictionaryObject) and '/D' in target:
                target = target['/D']
            dest = self._buildDestination(key, target)
            if dest is not None:
                retval[key] = dest

    return retval
1326 
outlines = property(fget=lambda self: self.getOutlines())
"""
Read-only property that accesses the
:meth:`getOutlines()<PdfFileReader.getOutlines>` function.
"""
1332 
def getOutlines(self, node=None, outlines=None):
    """
    Retrieves the document outline present in the document.

    On the initial call (``outlines`` omitted) the first outline node is
    taken from the catalog's ``/Outlines`` entry and the named destinations
    are cached on ``self._namedDests``. Sibling nodes are followed through
    ``/Next``; children (``/First``) are collected into a nested list placed
    right after their parent.

    :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
    """
    if outlines is None:
        outlines = []
        catalog = self.trailer["/Root"]

        # get the outline dictionary and named destinations
        if "/Outlines" in catalog:
            try:
                outline_root = catalog["/Outlines"]
            except utils.PdfReadError:
                # this occurs if the /Outlines object reference is incorrect
                # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf
                # so continue to load the file without the Bookmarks
                return outlines

            if "/First" in outline_root:
                node = outline_root["/First"]
        self._namedDests = self.getNamedDestinations()

    if node is None:
        return outlines

    # see if there are any more outlines
    while True:
        entry = self._buildOutline(node)
        if entry:
            outlines.append(entry)

        # check for sub-outlines
        if "/First" in node:
            children = []
            self.getOutlines(node["/First"], children)
            if children:
                outlines.append(children)

        if "/Next" not in node:
            return outlines
        node = node["/Next"]
1378 
1379  def _getPageNumberByIndirect(self, indirectRef):
1380  """Generate _pageId2Num"""
1381  if self._pageId2Num is None:
1382  id2num = {}
1383  for i, x in enumerate(self.pages):
1384  id2num[x.indirectRef.idnum] = i
1385  self._pageId2Num = id2num
1386 
1387  if isinstance(indirectRef, int):
1388  idnum = indirectRef
1389  else:
1390  idnum = indirectRef.idnum
1391 
1392  ret = self._pageId2Num.get(idnum, -1)
1393  return ret
1394 
def getPageNumber(self, page):
    """
    Retrieve page number of a given PageObject

    :param PageObject page: The page to get page number. Should be
        an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(page.indirectRef)
1407 
def getDestinationPageNumber(self, destination):
    """
    Retrieve page number of a given Destination object

    :param Destination destination: The destination to get page number.
         Should be an instance of
         :class:`Destination<PyPDF2.pdf.Destination>`
    :return: the page number or -1 if page not found
    :rtype: int
    """
    return self._getPageNumberByIndirect(destination.page)
1421 
def _buildDestination(self, title, array):
    """
    Create a ``Destination`` from *title* and a destination array.

    The first two entries of *array* are the page and the fit type; any
    remaining entries are forwarded as positional arguments.
    """
    head, extra = array[0:2], array[2:]
    page, typ = head
    return Destination(title, page, typ, *extra)
1426 
1427  def _buildOutline(self, node):
1428  dest, title, outline = None, None, None
1429 
1430  if "/A" in node and "/Title" in node:
1431  # Action, section 8.5 (only type GoTo supported)
1432  title = node["/Title"]
1433  action = node["/A"]
1434  if action["/S"] == "/GoTo":
1435  dest = action["/D"]
1436  elif "/Dest" in node and "/Title" in node:
1437  # Destination, section 8.2.1
1438  title = node["/Title"]
1439  dest = node["/Dest"]
1440 
1441  # if destination found, then create outline
1442  if dest:
1443  if isinstance(dest, ArrayObject):
1444  outline = self._buildDestination(title, dest)
1445  elif isString(dest) and dest in self._namedDests:
1446  outline = self._namedDests[dest]
1447  outline[NameObject("/Title")] = title
1448  else:
1449  raise utils.PdfReadError("Unexpected destination %r" % dest)
1450  return outline
1451 
pages = property(
    fget=lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage))
"""
Read-only property that emulates a list based upon the
:meth:`getNumPages()<PdfFileReader.getNumPages>` and
:meth:`getPage()<PdfFileReader.getPage>` methods.
"""
1459 
def getPageLayout(self):
    """
    Get the page layout.

    Reads ``/PageLayout`` from the trailer's ``/Root`` dictionary.
    See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
    for a description of valid layouts.

    :return: Page layout currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        layout = self.trailer['/Root']['/PageLayout']
    except KeyError:
        # Either key may be absent; both cases mean "not specified".
        layout = None
    return layout
1473 
pageLayout = property(fget=getPageLayout)
"""Read-only property accessing the
:meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
1477 
def getPageMode(self):
    """
    Get the page mode.

    Reads ``/PageMode`` from the trailer's ``/Root`` dictionary.
    See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
    for a description of valid modes.

    :return: Page mode currently being used.
    :rtype: ``str``, ``None`` if not specified
    """
    try:
        mode = self.trailer['/Root']['/PageMode']
    except KeyError:
        # Either key may be absent; both cases mean "not specified".
        mode = None
    return mode
1491 
pageMode = property(fget=getPageMode)
"""Read-only property accessing the
:meth:`getPageMode()<PdfFileReader.getPageMode>` method."""
1495 
def _flatten(self, pages=None, inherit=None, indirectRef=None):
    """
    Walk the page tree and fill ``self.flattenedPages`` with page objects.

    Called without arguments it resets ``self.flattenedPages`` and starts at
    the catalog's ``/Pages`` node; the parameters are for the recursion.

    :param pages: the page-tree node to process (``/Pages`` or ``/Page``).
    :param dict inherit: inheritable attributes collected from the
        ancestors of *pages*.
    :param indirectRef: the indirect reference through which *pages* was
        reached, passed on to the created ``PageObject``.
    """
    inheritablePageAttributes = (
        NameObject("/Resources"), NameObject("/MediaBox"),
        NameObject("/CropBox"), NameObject("/Rotate")
        )
    if inherit is None:
        inherit = dict()
    if pages is None:
        self.flattenedPages = []
        catalog = self.trailer["/Root"].getObject()
        pages = catalog["/Pages"].getObject()

    # A node without /Type is treated as an intermediate /Pages node.
    t = "/Pages"
    if "/Type" in pages:
        t = pages["/Type"]

    if t == "/Pages":
        # Work on a copy: attributes defined by this node apply only to its
        # own descendants. Mutating the caller's dict would leak them into
        # sibling subtrees that are processed later.
        inherit = dict(inherit)
        for attr in inheritablePageAttributes:
            if attr in pages:
                inherit[attr] = pages[attr]
        for page in pages["/Kids"]:
            addt = {}
            if isinstance(page, IndirectObject):
                addt["indirectRef"] = page
            self._flatten(page.getObject(), inherit, **addt)
    elif t == "/Page":
        for attr, value in list(inherit.items()):
            # if the page has its own value, it does not inherit the
            # parent's value:
            if attr not in pages:
                pages[attr] = value
        pageObj = PageObject(self, indirectRef)
        pageObj.update(pages)
        self.flattenedPages.append(pageObj)
1530 
def _getObjectFromStream(self, indirectReference):
    """
    Read the object named by *indirectReference* out of its object stream.

    ``self.xref_objStm`` supplies the containing stream's object number and
    the expected index of the object inside it. The stream's leading
    (object number, offset) pairs are scanned until the wanted object number
    is found, then the object is parsed at ``/First`` plus its offset.

    :return: the parsed object. A ``NullObject`` is returned in non-strict
        mode when the object is absent or cannot be parsed.
    :raises PdfReadError: in strict mode, when the object sits at a
        different index than expected, cannot be parsed, or is absent.
    """
    # indirect reference to object in object stream
    # read the entire object stream into memory
    debug = False
    wanted_id = indirectReference.idnum
    stmnum, idx = self.xref_objStm[wanted_id]
    if debug: print(("Here1: %s %s"%(stmnum, idx)))
    objStm = IndirectObject(stmnum, 0, self).getObject()
    if debug: print(("Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData())))
    # This is an xref to a stream, so its type better be a stream
    assert objStm['/Type'] == '/ObjStm'
    # /N is the number of indirect objects in the stream
    assert idx < objStm['/N']
    streamData = BytesIO(b_(objStm.getData()))

    def _skip_whitespace():
        # Advance to the next non-whitespace byte, then step back onto it.
        readNonWhitespace(streamData)
        streamData.seek(-1, 1)

    for i in range(objStm['/N']):
        _skip_whitespace()
        objnum = NumberObject.readFromStream(streamData)
        _skip_whitespace()
        offset = NumberObject.readFromStream(streamData)
        _skip_whitespace()
        if objnum != wanted_id:
            # We're only interested in one object
            continue
        if self.strict and idx != i:
            raise utils.PdfReadError("Object is in wrong index.")
        streamData.seek(objStm['/First']+offset, 0)
        if debug:
            pos = streamData.tell()
            streamData.seek(0, 0)
            for dump_line in streamData.readlines():
                print((dump_line))
            streamData.seek(pos, 0)
        try:
            return readObject(streamData, self)
        except utils.PdfStreamError as e:
            # Stream object cannot be read. Normally, a critical error, but
            # Adobe Reader doesn't complain, so continue (in strict mode?)
            warnings.warn("Invalid stream (index %d) within object %d %d: %s" % \
                  (i, indirectReference.idnum, indirectReference.generation, e), utils.PdfReadWarning)

            if self.strict:
                raise utils.PdfReadError("Can't read object stream: %s"%e)
            # Replace with null. Hopefully it's nothing important.
            return NullObject()

    if self.strict: raise utils.PdfReadError("This is a fatal error in strict mode.")
    return NullObject()
1583 
def getObject(self, indirectReference):
    """
    Resolve *indirectReference* to the object it points to.

    The cache is consulted first. Otherwise the object is read either from
    an object stream (generation 0 ids listed in ``self.xref_objStm``) or
    from the offset recorded in ``self.xref``; in the latter case it is
    decrypted when the file is encrypted and encryption is not overridden.
    The result is cached before it is returned.

    :raises PdfReadError: if the object header does not match the
        reference, the file has not been decrypted, or the reference is
        unknown to both cross-reference tables.
    """
    debug = False
    wanted_id = indirectReference.idnum
    wanted_gen = indirectReference.generation
    if debug: print(("looking at:", indirectReference.idnum, indirectReference.generation))

    cached = self.cacheGetIndirectObject(wanted_gen, wanted_id)
    if cached is not None:
        return cached

    if wanted_gen == 0 and wanted_id in self.xref_objStm:
        retval = self._getObjectFromStream(indirectReference)
    elif wanted_gen in self.xref and wanted_id in self.xref[wanted_gen]:
        start = self.xref[wanted_gen][wanted_id]
        if debug: print(("  Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
        self.stream.seek(start, 0)
        idnum, generation = self.readObjectHeader(self.stream)
        if idnum != wanted_id:
            if self.xrefIndex:
                # Xref table probably had bad indexes due to not being zero-indexed
                if self.strict:
                    raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
                                     % (indirectReference.idnum, indirectReference.generation, idnum, generation))
                # xref table is corrected in non-strict mode
            else:
                # some other problem
                raise utils.PdfReadError("Expected object ID (%d %d) does not match actual (%d %d)." \
                                         % (indirectReference.idnum, indirectReference.generation, idnum, generation))
        assert generation == indirectReference.generation
        retval = readObject(self.stream, self)

        # override encryption is used for the /Encrypt dictionary
        if not self._override_encryption and self.isEncrypted:
            # if we don't have the encryption key:
            if not hasattr(self, '_decryption_key'):
                raise utils.PdfReadError("file has not been decrypted")
            # otherwise, decrypt here...
            import struct
            base_key = self._decryption_key
            id_bytes = struct.pack("<i", indirectReference.idnum)[:3]
            gen_bytes = struct.pack("<i", indirectReference.generation)[:2]
            key = base_key + id_bytes + gen_bytes
            assert len(key) == (len(base_key) + 5)
            # The per-object key is a prefix of the MD5 digest, at most 16 bytes.
            key = md5(key).digest()[:min(16, len(base_key) + 5)]
            retval = self._decryptObject(retval, key)
    else:
        warnings.warn("Object %d %d not defined."%(indirectReference.idnum,
                    indirectReference.generation), utils.PdfReadWarning)
        #if self.strict:
        raise utils.PdfReadError("Could not find object.")

    self.cacheIndirectObject(wanted_gen, wanted_id, retval)
    return retval
1635 
def _decryptObject(self, obj, key):
    """
    Apply ``utils.RC4_encrypt`` with *key* to the string data found in *obj*.

    String objects are replaced by a new string object, stream objects have
    their ``_data`` rewritten in place, and dictionaries and arrays are
    processed element by element, recursively. Any other object is returned
    untouched.

    :return: the processed object (a new object for strings, otherwise *obj*).
    """
    if isinstance(obj, (ByteStringObject, TextStringObject)):
        return createStringObject(utils.RC4_encrypt(key, obj.original_bytes))
    # Streams are tested before plain dictionaries: only their data is
    # rewritten, their dictionary entries are not visited.
    if isinstance(obj, StreamObject):
        obj._data = utils.RC4_encrypt(key, obj._data)
    elif isinstance(obj, DictionaryObject):
        for dictkey, value in list(obj.items()):
            obj[dictkey] = self._decryptObject(value, key)
    elif isinstance(obj, ArrayObject):
        for index, item in enumerate(obj):
            obj[index] = self._decryptObject(item, key)
    return obj
1648 
def readObjectHeader(self, stream):
    """
    Read an object header from *stream* and return its two numbers.

    Comments and whitespace in front of the header are skipped. In strict
    mode a warning is issued when superfluous whitespace was found.

    :return: ``(idnum, generation)`` as integers.
    """
    # Should never be necessary to read out whitespace, since the
    # cross-reference table should put us in the right spot to read the
    # object header. In reality... some files have stupid cross reference
    # tables that are off by whitespace bytes.
    utils.skipOverComment(stream)
    ws_before_id = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    idnum = readUntilWhitespace(stream)
    ws_before_gen = utils.skipOverWhitespace(stream)
    stream.seek(-1, 1)
    generation = readUntilWhitespace(stream)
    # Three bytes are consumed and discarded here (presumably the "obj" keyword).
    stream.read(3)
    readNonWhitespace(stream)
    stream.seek(-1, 1)
    if (ws_before_id or ws_before_gen) and self.strict:
        #not a fatal error
        warnings.warn("Superfluous whitespace found in object header %s %s" % \
                      (idnum, generation), utils.PdfReadWarning)
    return int(idnum), int(generation)
1668 
def cacheGetIndirectObject(self, generation, idnum):
    """Return the object cached under ``(generation, idnum)``, or ``None`` if there is none."""
    debug = False
    cached = self.resolvedObjects.get((generation, idnum))
    if debug:
        if cached:
            print(("cache hit: %d %d"%(idnum, generation)))
        else:
            print(("cache miss: %d %d"%(idnum, generation)))
    return cached
1675 
def cacheIndirectObject(self, generation, idnum, obj):
    """
    Store *obj* in the object cache under ``(generation, idnum)``.

    Overwriting an existing entry raises ``PdfReadError`` in strict mode and
    only warns otherwise.

    :return: *obj*, unchanged.
    """
    # return None # Sometimes we want to turn off cache for debugging.
    cache_key = (generation, idnum)
    if cache_key in self.resolvedObjects:
        msg = "Overwriting cache for %s %s"%(generation, idnum)
        if self.strict:
            raise utils.PdfReadError(msg)
        warnings.warn(msg)
    self.resolvedObjects[cache_key] = obj
    return obj
1684 
1685  def read(self, stream):
1686  debug = False
1687  if debug: print(">>read", stream)
1688  # start at the end:
1689  stream.seek(-1, 2)
1690  if not stream.tell():
1691  raise utils.PdfReadError('Cannot read an empty file')
1692  last1K = stream.tell() - 1024 + 1 # offset of last 1024 bytes of stream
1693  line = b_('')
1694  while line[:5] != b_("%%EOF"):
1695  if stream.tell() < last1K:
1696  raise utils.PdfReadError("EOF marker not found")
1697  line = self.readNextEndLine(stream)
1698  if debug: print(" line:",line)
1699 
1700  # find startxref entry - the location of the xref table
1701  line = self.readNextEndLine(stream)
1702  try:
1703  startxref = int(line)
1704  except ValueError:
1705  # 'startxref' may be on the same line as the location
1706  if not line.startswith(b_("startxref")):
1707  raise utils.PdfReadError("startxref not found")
1708  startxref = int(line[9:].strip())
1709  warnings.warn("startxref on same line as offset")
1710  else:
1711  line = self.readNextEndLine(stream)
1712  if line[:9] != b_("startxref"):
1713  raise utils.PdfReadError("startxref not found")
1714 
1715  # read all cross reference tables and their trailers
1716  self.xref = {}
1717  self.xref_objStm = {}
1719  while True:
1720  # load the xref table
1721  stream.seek(startxref, 0)
1722  x = stream.read(1)
1723  if x == b_("x"):
1724  # standard cross-reference table
1725  ref = stream.read(4)
1726  if ref[:3] != b_("ref"):
1727  raise utils.PdfReadError("xref table read error")
1728  readNonWhitespace(stream)
1729  stream.seek(-1, 1)
1730  firsttime = True; # check if the first time looking at the xref table
1731  while True:
1732  num = readObject(stream, self)
1733  if firsttime and num != 0:
1734  self.xrefIndex = num
1735  if self.strict:
1736  warnings.warn("Xref table not zero-indexed. ID numbers for objects will be corrected.", utils.PdfReadWarning)
1737  #if table not zero indexed, could be due to error from when PDF was created
1738  #which will lead to mismatched indices later on, only warned and corrected if self.strict=True
1739  firsttime = False
1740  readNonWhitespace(stream)
1741  stream.seek(-1, 1)
1742  size = readObject(stream, self)
1743  readNonWhitespace(stream)
1744  stream.seek(-1, 1)
1745  cnt = 0
1746  while cnt < size:
1747  line = stream.read(20)
1748 
1749  # It's very clear in section 3.4.3 of the PDF spec
1750  # that all cross-reference table lines are a fixed
1751  # 20 bytes (as of PDF 1.7). However, some files have
1752  # 21-byte entries (or more) due to the use of \r\n
1753  # (CRLF) EOL's. Detect that case, and adjust the line
1754  # until it does not begin with a \r (CR) or \n (LF).
1755  while line[0] in b_("\x0D\x0A"):
1756  stream.seek(-20 + 1, 1)
1757  line = stream.read(20)
1758 
1759  # On the other hand, some malformed PDF files
1760  # use a single character EOL without a preceeding
1761  # space. Detect that case, and seek the stream
1762  # back one character. (0-9 means we've bled into
1763  # the next xref entry, t means we've bled into the
1764  # text "trailer"):
1765  if line[-1] in b_("0123456789t"):
1766  stream.seek(-1, 1)
1767 
1768  offset, generation = line[:16].split(b_(" "))
1769  offset, generation = int(offset), int(generation)
1770  if generation not in self.xref:
1771  self.xref[generation] = {}
1772  if num in self.xref[generation]:
1773  # It really seems like we should allow the last
1774  # xref table in the file to override previous
1775  # ones. Since we read the file backwards, assume
1776  # any existing key is already set correctly.
1777  pass
1778  else:
1779  self.xref[generation][num] = offset
1780  cnt += 1
1781  num += 1
1782  readNonWhitespace(stream)
1783  stream.seek(-1, 1)
1784  trailertag = stream.read(7)
1785  if trailertag != b_("trailer"):
1786  # more xrefs!
1787  stream.seek(-7, 1)
1788  else:
1789  break
1790  readNonWhitespace(stream)
1791  stream.seek(-1, 1)
1792  newTrailer = readObject(stream, self)
1793  for key, value in list(newTrailer.items()):
1794  if key not in self.trailer:
1795  self.trailer[key] = value
1796  if "/Prev" in newTrailer:
1797  startxref = newTrailer["/Prev"]
1798  else:
1799  break
1800  elif x.isdigit():
1801  # PDF 1.5+ Cross-Reference Stream
1802  stream.seek(-1, 1)
1803  idnum, generation = self.readObjectHeader(stream)
1804  xrefstream = readObject(stream, self)
1805  assert xrefstream["/Type"] == "/XRef"
1806  self.cacheIndirectObject(generation, idnum, xrefstream)
1807  streamData = BytesIO(b_(xrefstream.getData()))
1808  # Index pairs specify the subsections in the dictionary. If
1809  # none create one subsection that spans everything.
1810  idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
1811  if debug: print(("read idx_pairs=%s"%list(self._pairs(idx_pairs))))
1812  entrySizes = xrefstream.get("/W")
1813  assert len(entrySizes) >= 3
1814  if self.strict and len(entrySizes) > 3:
1815  raise utils.PdfReadError("Too many entry sizes: %s" %entrySizes)
1816 
1817  def getEntry(i):
1818  # Reads the correct number of bytes for each entry. See the
1819  # discussion of the W parameter in PDF spec table 17.
1820  if entrySizes[i] > 0:
1821  d = streamData.read(entrySizes[i])
1822  return convertToInt(d, entrySizes[i])
1823 
1824  # PDF Spec Table 17: A value of zero for an element in the
1825  # W array indicates...the default value shall be used
1826  if i == 0: return 1 # First value defaults to 1
1827  else: return 0
1828 
1829  def used_before(num, generation):
1830  # We move backwards through the xrefs, don't replace any.
1831  return num in self.xref.get(generation, []) or \
1832  num in self.xref_objStm
1833 
1834  # Iterate through each subsection
1835  last_end = 0
1836  for start, size in self._pairs(idx_pairs):
1837  # The subsections must increase
1838  assert start >= last_end
1839  last_end = start + size
1840  for num in range(start, start+size):
1841  # The first entry is the type
1842  xref_type = getEntry(0)
1843  # The rest of the elements depend on the xref_type
1844  if xref_type == 0:
1845  # linked list of free objects
1846  next_free_object = getEntry(1)
1847  next_generation = getEntry(2)
1848  elif xref_type == 1:
1849  # objects that are in use but are not compressed
1850  byte_offset = getEntry(1)
1851  generation = getEntry(2)
1852  if generation not in self.xref:
1853  self.xref[generation] = {}
1854  if not used_before(num, generation):
1855  self.xref[generation][num] = byte_offset
1856  if debug: print(("XREF Uncompressed: %s %s"%(
1857  num, generation)))
1858  elif xref_type == 2:
1859  # compressed objects
1860  objstr_num = getEntry(1)
1861  obstr_idx = getEntry(2)
1862  generation = 0 # PDF spec table 18, generation is 0
1863  if not used_before(num, generation):
1864  if debug: print(("XREF Compressed: %s %s %s"%(
1865  num, objstr_num, obstr_idx)))
1866  self.xref_objStm[num] = (objstr_num, obstr_idx)
1867  elif self.strict:
1868  raise utils.PdfReadError("Unknown xref type: %s"%
1869  xref_type)
1870 
1871  trailerKeys = "/Root", "/Encrypt", "/Info", "/ID"
1872  for key in trailerKeys:
1873  if key in xrefstream and key not in self.trailer:
1874  self.trailer[NameObject(key)] = xrefstream.raw_get(key)
1875  if "/Prev" in xrefstream:
1876  startxref = xrefstream["/Prev"]
1877  else:
1878  break
1879  else:
1880  # bad xref character at startxref. Let's see if we can find
1881  # the xref table nearby, as we've observed this error with an
1882  # off-by-one before.
1883  stream.seek(-11, 1)
1884  tmp = stream.read(20)
1885  xref_loc = tmp.find(b_("xref"))
1886  if xref_loc != -1:
1887  startxref -= (10 - xref_loc)
1888  continue
1889  # No explicit xref table, try finding a cross-reference stream.
1890  stream.seek(startxref, 0)
1891  found = False
1892  for look in range(5):
1893  if stream.read(1).isdigit():
1894  # This is not a standard PDF, consider adding a warning
1895  startxref += look
1896  found = True
1897  break
1898  if found:
1899  continue
1900  # no xref table found at specified location
1901  raise utils.PdfReadError("Could not find xref table at specified location")
1902  #if not zero-indexed, verify that the table is correct; change it if necessary
1903  if self.xrefIndex and not self.strict:
1904  loc = stream.tell()
1905  for gen in self.xref:
1906  if gen == 65535: continue
1907  for id in self.xref[gen]:
1908  stream.seek(self.xref[gen][id], 0)
1909  try:
1910  pid, pgen = self.readObjectHeader(stream)
1911  except ValueError:
1912  break
1913  if pid == id - self.xrefIndex:
1914  self._zeroXref(gen)
1915  break
1916  #if not, then either it's just plain wrong, or the non-zero-index is actually correct
1917  stream.seek(loc, 0) #return to where it was
1918 
def _zeroXref(self, generation):
    """
    Re-key the cross-reference entries of one generation so that every
    object ID is shifted down by ``self.xrefIndex``.

    :param int generation: generation number whose xref table is rebuilt.
    """
    shift = self.xrefIndex
    rebased = {}
    for obj_id, offset in self.xref[generation].items():
        rebased[obj_id - shift] = offset
    self.xref[generation] = rebased
1921 
def _pairs(self, array):
    """
    Yield consecutive, non-overlapping ``(first, second)`` pairs from
    *array*: ``(array[0], array[1])``, ``(array[2], array[3])``, ...

    A trailing unpaired element is ignored.  An empty (or one-element)
    array yields nothing instead of raising ``IndexError`` from inside
    the generator.

    :param array: indexable sequence (e.g. the ``/Index`` array of a
        cross-reference stream).
    :return: generator of 2-tuples.
    """
    # Stop at len - 1 so that array[i + 1] is always a valid index.
    for i in range(0, len(array) - 1, 2):
        yield array[i], array[i + 1]
1929 
def readNextEndLine(self, stream):
    """
    Read *backwards* from the current stream position, one byte at a
    time, collecting bytes until an end-of-line byte (CR or LF) is met.

    Each iteration reads one byte and then seeks two bytes back, so the
    stream position moves towards the start of the file.  Once an EOL
    byte is found, any further adjacent EOL bytes are skipped too.

    :param stream: seekable binary stream.
    :return: the bytes collected before the EOL, in file order.
    :raises utils.PdfReadError: if the start of the stream is reached
        before an EOL marker is found.
    """
    debug = False
    eol_bytes = (b_('\n'), b_('\r'))
    if debug: print(">>readNextEndLine")
    collected = b_("")
    while True:
        # Prevent infinite loops in malformed PDFs
        if stream.tell() == 0:
            raise utils.PdfReadError("Could not read malformed PDF file")
        ch = stream.read(1)
        if debug: print((" x:", ch, "%x"%ord(ch)))
        if stream.tell() < 2:
            raise utils.PdfReadError("EOL marker not found")
        stream.seek(-2, 1)

        if ch not in eol_bytes:
            # Ordinary byte: prepend it, because we are walking backwards.
            if debug: print(" x is neither")
            collected = ch + collected
            if debug: print((" RNEL line:", collected))
            continue

        # Reached an EOL byte: swallow the whole run of EOL bytes.
        two_byte_eol = False
        while ch in eol_bytes:
            if debug:
                if ord(ch) == 0x0D: print(" x is CR 0D")
                elif ord(ch) == 0x0A: print(" x is LF 0A")
            ch = stream.read(1)
            if ch in eol_bytes:  # account for CR+LF
                stream.seek(-1, 1)
                two_byte_eol = True
            if stream.tell() < 2:
                raise utils.PdfReadError("EOL marker not found")
            stream.seek(-2, 1)
        # Step forward again: 2 bytes when CR+LF was seen, else 1.
        stream.seek(2 if two_byte_eol else 1, 1)
        break
    if debug: print("leaving RNEL")
    return collected
1964 
def decrypt(self, password):
    """
    Attempt to unlock a PDF that is protected with the PDF Standard
    encryption handler.

    The password is tried against both the user password and the owner
    password of the document.  Whichever one matches, the resulting
    decryption key is stored so that the document can be used with this
    library.

    :param str password: The password to match.
    :return: ``0`` if the password failed, ``1`` if the password matched the user
        password, and ``2`` if the password matched the owner password.
    :rtype: int
    :raises NotImplementedError: if document uses an unsupported encryption
        method.
    """
    # The override flag is only raised for the duration of the check and
    # is always reset, even when _decrypt() raises.
    self._override_encryption = True
    try:
        outcome = self._decrypt(password)
    finally:
        self._override_encryption = False
    return outcome
1990 
def _decrypt(self, password):
    """
    Check *password* against the user password, then the owner password,
    and store the decryption key on success.

    :param password: candidate password.
    :return: ``1`` for a user-password match, ``2`` for an owner-password
        match, ``0`` otherwise.
    :raises NotImplementedError: if the ``/Filter`` is not ``/Standard``
        or ``/V`` is not 1 or 2.
    """
    encrypt = self.trailer['/Encrypt'].getObject()
    if encrypt['/Filter'] != '/Standard':
        raise NotImplementedError("only Standard PDF encryption handler is available")
    if encrypt['/V'] not in (1, 2):
        raise NotImplementedError("only algorithm code 1 and 2 are supported")

    # First attempt: treat the password as the user password.
    matched, key = self._authenticateUserPassword(password)
    if matched:
        self._decryption_key = key
        return 1

    # Second attempt: treat it as the owner password.  The /O entry is
    # decrypted with a key derived from the candidate, and the result is
    # then checked as if it were the user password.
    rev = encrypt['/R'].getObject()
    keylen = 5 if rev == 2 else encrypt['/Length'].getObject() // 8
    key = _alg33_1(password, rev, keylen)
    owner_entry = encrypt["/O"].getObject()
    if rev == 2:
        userpass = utils.RC4_encrypt(key, owner_entry)
    else:
        userpass = owner_entry
        for round_no in reversed(range(20)):
            # Every byte of the key is XOR-ed with the round counter.
            round_key = b_('').join(
                [b_(chr(utils.ord_(c) ^ round_no)) for c in key])
            userpass = utils.RC4_encrypt(round_key, userpass)

    matched, key = self._authenticateUserPassword(userpass)
    if matched:
        self._decryption_key = key
        return 2
    return 0
2024 
def _authenticateUserPassword(self, password):
    """
    Compute the ``/U`` value for *password* and compare it with the one
    stored in the encryption dictionary.

    :param password: candidate user password.
    :return: tuple ``(matches, key)`` where ``matches`` tells whether the
        computed value equals the stored ``/U`` entry and ``key`` is the
        key returned by the algorithm helper.
    """
    enc = self.trailer['/Encrypt'].getObject()
    rev = enc['/R'].getObject()
    owner_entry = enc['/O'].getObject()
    p_entry = enc['/P'].getObject()
    id_entry = self.trailer['/ID'].getObject()
    id1_entry = id_entry[0].getObject()
    stored_u = enc['/U'].getObject().original_bytes
    if rev == 2:
        computed_u, key = _alg34(password, owner_entry, p_entry, id1_entry)
    elif rev >= 3:
        key_length = enc["/Length"].getObject() // 8
        metadata_flag = enc.get("/EncryptMetadata", BooleanObject(False)).getObject()
        computed_u, key = _alg35(password, rev, key_length, owner_entry,
                                 p_entry, id1_entry, metadata_flag)
        # For these revisions only the first 16 bytes are compared.
        computed_u = computed_u[:16]
        stored_u = stored_u[:16]
    return computed_u == stored_u, key
2042 
def getIsEncrypted(self):
    """
    Tell whether the trailer contains an ``/Encrypt`` entry.

    :return: ``True`` if ``"/Encrypt"`` is a key of ``self.trailer``.
    :rtype: bool
    """
    trailer = self.trailer
    return "/Encrypt" in trailer
2045 
isEncrypted = property(fget=lambda reader: reader.getIsEncrypted())
"""
Read-only boolean property telling whether this PDF file is encrypted.
The value stays true even after the
:meth:`decrypt()<PdfFileReader.decrypt>` method has been called.
"""
2052 
2053 
def getRectangle(self, name, defaults):
    """
    Fetch the rectangle stored under *name*, falling back to other keys.

    If the stored value is already a ``RectangleObject`` it is returned
    as is.  Otherwise the value (or the first non-``None`` value found
    under one of *defaults*) is resolved if it is an indirect reference,
    converted to a ``RectangleObject``, stored back under *name* and
    returned.

    :param name: key of the requested box (e.g. ``"/CropBox"``).
    :param defaults: iterable of fallback keys tried in order when
        *name* is absent.
    :return: the rectangle for *name*.
    """
    retval = self.get(name)
    if isinstance(retval, RectangleObject):
        return retval
    # Identity tests: ``None`` is a singleton and must not be compared
    # with ``==``, which could be routed through a custom ``__eq__``.
    if retval is None:
        for d in defaults:
            retval = self.get(d)
            if retval is not None:
                break
    if isinstance(retval, IndirectObject):
        retval = self.pdf.getObject(retval)
    retval = RectangleObject(retval)
    # Cache the converted rectangle so later lookups return it directly.
    setRectangle(self, name, retval)
    return retval
2068 
2069 
def setRectangle(self, name, value):
    """
    Store *value* under *name*, wrapping the key in a ``NameObject``
    when it is not one already.

    :param name: key of the box to set.
    :param value: the rectangle to store.
    """
    key = name if isinstance(name, NameObject) else NameObject(name)
    self[key] = value
2074 
2075 
def deleteRectangle(self, name):
    """
    Remove the entry stored under *name*.

    :param name: key of the box to delete.
    """
    del self[name]
2078 
2079 
def createRectangleAccessor(name, fallback):
    """
    Build a ``property`` that reads, writes and deletes the rectangle
    stored under *name*.

    :param name: key of the box handled by the property.
    :param fallback: keys passed to :func:`getRectangle` as defaults.
    :return: a ``property`` object with getter, setter and deleter.
    """
    # NOTE: the inner functions deliberately carry no docstring, so the
    # resulting property has no ``__doc__`` (as with plain lambdas).
    def _read(page):
        return getRectangle(page, name, fallback)

    def _write(page, value):
        return setRectangle(page, name, value)

    def _remove(page):
        return deleteRectangle(page, name)

    return property(_read, _write, _remove)
2087 
2088 
2090  """
2091  This class represents a single page within a PDF file. Typically this
2092  object will be created by accessing the
2093  :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the
2094  :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is
2095  also possible to create an empty page with the
2096  :meth:`createBlankPage()<PageObject.createBlankPage>` static method.
2097 
2098  :param pdf: PDF file the page belongs to.
2099  :param indirectRef: Stores the original indirect reference to
2100  this object in its source PDF
2101  """
def __init__(self, pdf=None, indirectRef=None):
    """
    Initialise an empty page dictionary.

    :param pdf: PDF file the page belongs to.
    :param indirectRef: the original indirect reference to this object
        in its source PDF.
    """
    DictionaryObject.__init__(self)
    self.indirectRef = indirectRef
    self.pdf = pdf
2106 
def createBlankPage(pdf=None, width=None, height=None):
    """
    Returns a new blank page.
    If ``width`` or ``height`` is ``None``, try to get the page size
    from the last page of *pdf*.

    :param pdf: PDF file the page belongs to
    :param float width: The width of the new page expressed in default user
        space units.
    :param float height: The height of the new page expressed in default user
        space units.
    :return: the new blank page:
    :rtype: :class:`PageObject<PageObject>`
    :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
        no page
    """
    page = PageObject(pdf)

    # Creates a new page (cf PDF Reference 7.7.3.3)
    page[NameObject('/Type')] = NameObject('/Page')
    page[NameObject('/Parent')] = NullObject()
    page[NameObject('/Resources')] = DictionaryObject()
    if width is None or height is None:
        if pdf is not None and pdf.getNumPages() > 0:
            # Borrow the dimensions of the last page of the document.
            lastpage = pdf.getPage(pdf.getNumPages() - 1)
            width = lastpage.mediaBox.getWidth()
            height = lastpage.mediaBox.getHeight()
        else:
            # No size given and no page to copy one from.
            raise utils.PageSizeNotDefinedError()
    page[NameObject('/MediaBox')] = RectangleObject([0, 0, width, height])

    return page
createBlankPage = staticmethod(createBlankPage)
2141 
def rotateClockwise(self, angle):
    """
    Rotate the page clockwise in steps of 90 degrees.

    :param int angle: Angle to rotate the page. Must be a multiple
        of 90 deg.
    :return: the page itself.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(angle)
    return self
2152 
def rotateCounterClockwise(self, angle):
    """
    Rotate the page counter-clockwise in steps of 90 degrees.

    :param int angle: Angle to rotate the page. Must be a multiple
        of 90 deg.
    :return: the page itself.
    """
    remainder = angle % 90
    assert remainder == 0
    self._rotate(-angle)
    return self
2163 
def _rotate(self, angle):
    """
    Add *angle* to the page's ``/Rotate`` entry (taken as 0 when absent).

    :param angle: amount added to the current rotation.
    """
    previous = self.get("/Rotate", 0)
    total = previous + angle
    self[NameObject("/Rotate")] = NumberObject(total)
2167 
def _mergeResources(res1, res2, resource):
    """
    Merge one resource category of two resource dictionaries.

    Entries of *res1* are kept.  An entry of *res2* whose key is new is
    copied over; an entry whose key already exists with a different raw
    value is stored under a fresh name (key plus a UUID).

    :param res1: resource dictionary of the first page.
    :param res2: resource dictionary of the second page.
    :param resource: name of the category (e.g. ``"/Font"``).
    :return: tuple ``(merged, renames)`` where ``renames`` maps each
        clashing key of *res2* to its new name.
    """
    merged = DictionaryObject()
    merged.update(res1.get(resource, DictionaryObject()).getObject())
    incoming = res2.get(resource, DictionaryObject()).getObject()
    renames = {}
    for key in list(incoming.keys()):
        if key not in merged:
            merged[key] = incoming.raw_get(key)
        elif merged.raw_get(key) != incoming.raw_get(key):
            # Same name, different object: give the newcomer a unique name.
            unique = NameObject(key + str(uuid.uuid4()))
            renames[key] = unique
            merged[unique] = incoming[key]
    return merged, renames
_mergeResources = staticmethod(_mergeResources)
2182 
def _contentStreamRename(stream, rename, pdf):
    """
    Apply a name-renaming table to the operands of a content stream.

    :param stream: the content to process.
    :param rename: mapping of old names to new names; when it is empty
        *stream* is returned untouched.
    :param pdf: PDF passed on to ``ContentStream``.
    :return: *stream* itself when there is nothing to rename, otherwise
        a ``ContentStream`` whose ``NameObject`` operands were replaced.
    """
    if not rename:
        return stream
    stream = ContentStream(stream, pdf)
    for operands, operator in stream.operations:
        for position, operand in enumerate(operands):
            if isinstance(operand, NameObject):
                operands[position] = rename.get(operand, operand)
    return stream
_contentStreamRename = staticmethod(_contentStreamRename)
2194 
def _pushPopGS(contents, pdf):
    """
    Wrap a content stream in a graphics-state save/restore pair.

    A ``q`` operation is put in front and a ``Q`` operation at the end,
    which isolates the stream from changes such as transformation
    matrices.

    :param contents: the content to wrap.
    :param pdf: PDF passed on to ``ContentStream``.
    :return: the resulting ``ContentStream``.
    """
    wrapped = ContentStream(contents, pdf)
    operations = wrapped.operations
    operations.insert(0, [[], "q"])
    operations.append([[], "Q"])
    return wrapped
_pushPopGS = staticmethod(_pushPopGS)
2204 
def _addTransformationMatrix(contents, pdf, ctm):
    """
    Put a ``cm`` (transformation matrix) operation at the very start of
    the given content stream.

    :param contents: the content to transform.
    :param pdf: PDF passed on to ``ContentStream``.
    :param ctm: exactly six matrix operands.
    :return: the resulting ``ContentStream``.
    """
    # Unpacking first makes a wrongly sized matrix fail early.
    a, b, c, d, e, f = ctm
    contents = ContentStream(contents, pdf)
    operands = [FloatObject(value) for value in (a, b, c, d, e, f)]
    contents.operations.insert(0, [operands, " cm"])
    return contents
_addTransformationMatrix = staticmethod(_addTransformationMatrix)
2215 
def getContents(self):
    """
    Give access to the contents of the page.

    :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
        ``/Contents`` is optional, as described in PDF Reference 7.7.3.3
    """
    if "/Contents" not in self:
        return None
    return self["/Contents"].getObject()
2227 
def mergePage(self, page2):
    """
    Merge the content streams of two pages into one.

    Resource references (i.e. fonts) of both pages are kept and the
    mediabox/cropbox/etc of this page are left alone.  The content
    stream of *page2* is added after this page's own content stream, so
    it is drawn after, or "on top" of, this page.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    """
    # Plain merge: no transformation, no matrix, no page expansion.
    self._mergePage(page2)
2240 
def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
    """
    Merge *page2* into this page: resources, annotations and content.

    :param page2: the page merged into this one.
    :param page2transformation: optional callable applied to the content
        of *page2* before it is merged.
    :param ctm: optional six-element matrix, used only to compute the
        new media box when *expand* is true.
    :param bool expand: when true, grow this page's media box so that it
        also covers the (transformed) media box of *page2*.
    """
    # The resource dictionaries are merged first, because that tells us
    # which symbols of the second content stream have to be renamed.
    mergedResources = DictionaryObject()
    renames = {}
    ownResources = self["/Resources"].getObject()
    otherResources = page2["/Resources"].getObject()

    # Collect the annotation arrays of both pages.
    mergedAnnots = ArrayObject()
    for source in (self, page2):
        if "/Annots" not in source:
            continue
        annots = source["/Annots"]
        if isinstance(annots, ArrayObject):
            mergedAnnots.extend(annots)

    categories = ("/ExtGState", "/Font", "/XObject", "/ColorSpace",
                  "/Pattern", "/Shading", "/Properties")
    for category in categories:
        merged, categoryRenames = PageObject._mergeResources(
            ownResources, otherResources, category)
        if merged:
            mergedResources[NameObject(category)] = merged
            renames.update(categoryRenames)

    # Combine /ProcSet sets.
    ownProcSet = frozenset(ownResources.get("/ProcSet", ArrayObject()).getObject())
    otherProcSet = frozenset(otherResources.get("/ProcSet", ArrayObject()).getObject())
    mergedResources[NameObject("/ProcSet")] = ArrayObject(ownProcSet.union(otherProcSet))

    contentParts = ArrayObject()

    ownContent = self.getContents()
    if ownContent is not None:
        contentParts.append(PageObject._pushPopGS(ownContent, self.pdf))

    otherContent = page2.getContents()
    if otherContent is not None:
        if page2transformation is not None:
            otherContent = page2transformation(otherContent)
        otherContent = PageObject._contentStreamRename(
            otherContent, renames, self.pdf)
        otherContent = PageObject._pushPopGS(otherContent, self.pdf)
        contentParts.append(otherContent)

    # if expanding the page to fit a new page, calculate the new media box size
    if expand:
        ownBox = self.mediaBox
        own_llx = ownBox.getLowerLeft_x().as_numeric()
        own_lly = ownBox.getLowerLeft_y().as_numeric()
        own_urx = ownBox.getUpperRight_x().as_numeric()
        own_ury = ownBox.getUpperRight_y().as_numeric()

        # Four corners of the other page's media box, as (x, y) points.
        otherBox = page2.mediaBox
        corners = [
            (otherBox.getLowerLeft_x().as_numeric(), otherBox.getLowerLeft_y().as_numeric()),
            (otherBox.getUpperLeft_x().as_numeric(), otherBox.getUpperLeft_y().as_numeric()),
            (otherBox.getUpperRight_x().as_numeric(), otherBox.getUpperRight_y().as_numeric()),
            (otherBox.getLowerRight_x().as_numeric(), otherBox.getLowerRight_y().as_numeric()),
        ]
        if ctm is not None:
            m = [float(value) for value in ctm]
            xs = [m[0]*x + m[2]*y + m[4] for x, y in corners]
            ys = [m[1]*x + m[3]*y + m[5] for x, y in corners]
        else:
            xs = [x for x, _ in corners]
            ys = [y for _, y in corners]

        lowerleft = [min(own_llx, min(xs)), min(own_lly, min(ys))]
        upperright = [max(own_urx, max(xs)), max(own_ury, max(ys))]

        self.mediaBox.setLowerLeft(lowerleft)
        self.mediaBox.setUpperRight(upperright)

    self[NameObject('/Contents')] = ContentStream(contentParts, self.pdf)
    self[NameObject('/Resources')] = mergedResources
    self[NameObject('/Annots')] = mergedAnnots
2314 
def mergeTransformedPage(self, page2, ctm, expand=False):
    """
    Like mergePage, except that a transformation matrix is applied to
    the stream being merged.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param tuple ctm: a 6-element tuple containing the operands of the
        transformation matrix
    :param bool expand: Whether the page should be expanded to fit the dimensions
        of the page to be merged.
    """
    def applyMatrix(page2Content):
        # Evaluated lazily, only when the merge actually has content.
        return PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm)

    self._mergePage(page2, applyMatrix, ctm, expand)
2329 
def mergeScaledPage(self, page2, scale, expand=False):
    """
    Like mergePage, except that the stream to be merged is scaled by
    applying a transformation matrix.

    :param PageObject page2: The page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # CTM to scale : [ sx 0 0 sy 0 0 ]
    scaling_ctm = [scale, 0, 0, scale, 0, 0]
    return self.mergeTransformedPage(page2, scaling_ctm, expand)
2345 
def mergeRotatedPage(self, page2, rotation, expand=False):
    """
    Like mergePage, except that the stream to be merged is rotated by
    applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    cosine = math.cos(angle)
    sine = math.sin(angle)
    # CTM to rotate : [ cos sin -sin cos 0 0 ]
    rotation_ctm = [cosine, sine, -sine, cosine, 0, 0]
    return self.mergeTransformedPage(page2, rotation_ctm, expand)
2362 
def mergeTranslatedPage(self, page2, tx, ty, expand=False):
    """
    Like mergePage, except that the stream to be merged is translated by
    applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    # CTM to translate : [ 1 0 0 1 tx ty ]
    translation_ctm = [1, 0, 0, 1, tx, ty]
    return self.mergeTransformedPage(page2, translation_ctm, expand)
2378 
def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False):
    """
    Like mergePage, except that the stream to be merged is rotated and
    translated by applying a transformation matrix.

    The matrix is the product of a translation by ``(-tx, -ty)``, the
    rotation, and a translation by ``(tx, ty)``, in that order.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    move_there = [[1, 0, 0],
                  [0, 1, 0],
                  [-tx, -ty, 1]]
    rotate = [[math.cos(angle), math.sin(angle), 0],
              [-math.sin(angle), math.cos(angle), 0],
              [0, 0, 1]]
    move_back = [[1, 0, 0],
                 [0, 1, 0],
                 [tx, ty, 1]]
    ctm = utils.matrixMultiply(utils.matrixMultiply(move_there, rotate), move_back)

    # Only the first two columns of the 3x3 matrix form the PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
2409 
def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False):
    """
    Like mergePage, except that the stream to be merged is rotated and
    scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    rotate = [[math.cos(angle), math.sin(angle), 0],
              [-math.sin(angle), math.cos(angle), 0],
              [0, 0, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    ctm = utils.matrixMultiply(rotate, resize)

    # Only the first two columns of the 3x3 matrix form the PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
2435 
def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False):
    """
    Like mergePage, except that the stream to be merged is translated
    and scaled by applying a transformation matrix.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float scale: The scaling factor
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    move = [[1, 0, 0],
            [0, 1, 0],
            [tx, ty, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    # Scaling comes first in the product, the translation second.
    ctm = utils.matrixMultiply(resize, move)

    # Only the first two columns of the 3x3 matrix form the PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
2461 
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False):
    """
    Like mergePage, except that the stream to be merged is translated,
    rotated and scaled by applying a transformation matrix.

    The matrix is the product of the rotation, the scaling and the
    translation, in that order.

    :param PageObject page2: the page to be merged into this one. Should be
        an instance of :class:`PageObject<PageObject>`.
    :param float tx: The translation on X axis
    :param float ty: The translation on Y axis
    :param float rotation: The angle of the rotation, in degrees
    :param float scale: The scaling factor
    :param bool expand: Whether the page should be expanded to fit the
        dimensions of the page to be merged.
    """
    angle = math.radians(rotation)
    move = [[1, 0, 0],
            [0, 1, 0],
            [tx, ty, 1]]
    rotate = [[math.cos(angle), math.sin(angle), 0],
              [-math.sin(angle), math.cos(angle), 0],
              [0, 0, 1]]
    resize = [[scale, 0, 0],
              [0, scale, 0],
              [0, 0, 1]]
    ctm = utils.matrixMultiply(utils.matrixMultiply(rotate, resize), move)

    # Only the first two columns of the 3x3 matrix form the PDF operands.
    operands = [ctm[row][col] for row in range(3) for col in range(2)]
    return self.mergeTransformedPage(page2, operands, expand)
2492 
2493 
def addTransformation(self, ctm):
    """
    Apply a transformation matrix to the page.

    Nothing happens when the page has no contents.

    :param tuple ctm: A 6-element tuple containing the operands of the
        transformation matrix.
    """
    content = self.getContents()
    if content is None:
        return
    transformed = PageObject._addTransformationMatrix(content, self.pdf, ctm)
    # Isolate the matrix from whatever might follow in the stream.
    self[NameObject('/Contents')] = PageObject._pushPopGS(transformed, self.pdf)
2511 
def scale(self, sx, sy):
    """
    Scale the page by the given factors: a transformation matrix is
    applied to its content and the page size is updated.  When the page
    has a ``/VP`` entry, its ``/BBox`` is scaled as well.

    :param float sx: The scaling factor on horizontal axis.
    :param float sy: The scaling factor on vertical axis.
    """
    self.addTransformation([sx, 0, 0, sy, 0, 0])

    box = self.mediaBox
    self.mediaBox = RectangleObject([
        float(box.getLowerLeft_x()) * sx,
        float(box.getLowerLeft_y()) * sy,
        float(box.getUpperRight_x()) * sx,
        float(box.getUpperRight_y()) * sy])

    if "/VP" not in self:
        return

    viewport = self["/VP"]
    # /VP may hold an array; in that case only its first entry is used.
    is_array = isinstance(viewport, ArrayObject)
    bbox = viewport[0]["/BBox"] if is_array else viewport["/BBox"]
    # x coordinates use sx, y coordinates use sy.
    factors = (sx, sy, sx, sy)
    scaled_bbox = RectangleObject(
        [float(bbox[i]) * factor for i, factor in enumerate(factors)])

    holder = self[NameObject("/VP")]
    if is_array:
        holder = holder[NumberObject(0)]
    holder[NameObject("/BBox")] = scaled_bbox
2543 
def scaleBy(self, factor):
    """
    Scale the page uniformly: a transformation matrix is applied to its
    content and the page size is updated.

    :param float factor: The scaling factor (for both X and Y axis).
    """
    sx = sy = factor
    self.scale(sx, sy)
2552 
def scaleTo(self, width, height):
    """
    Scale the page to the given dimensions: a transformation matrix is
    applied to its content and the page size is updated.

    :param float width: The new width.
    :param float height: The new height.
    """
    box = self.mediaBox
    # Each factor is the requested size over the current media box size.
    current_width = float(box.getUpperRight_x() - box.getLowerLeft_x())
    sx = width / current_width
    current_height = float(box.getUpperRight_y() - box.getLowerLeft_y())
    sy = height / current_height
    self.scale(sx, sy)
2566 
2568  """
2569  Compresses the size of this page by joining all content streams and
2570  applying a FlateDecode filter.
2571 
2572  However, it is possible that this function will perform no action if
2573  content stream compression becomes "automatic" for some reason.
2574  """
2575  content = self.getContents()
2576  if content is not None:
2577  if not isinstance(content, ContentStream):
2578  content = ContentStream(content, self.pdf)
2579  self[NameObject("/Contents")] = content.flateEncode()
2580 
def extractText(self):
    """
    Locate all text drawing commands, in the order they are provided in the
    content stream, and extract the text.  This works well for some PDF
    files, but poorly for others, depending on the generator used.  This will
    be refined in the future.  Do not rely on the order of text coming out of
    this function, as it will change if this function is made more
    sophisticated.

    A page without ``/Contents`` (the entry is optional) yields an empty
    string instead of raising ``KeyError``.

    :return: a unicode string object.
    """
    text = u_("")
    content = self.getContents()
    if content is None:
        # Nothing is drawn on this page, so there is no text either.
        return text
    if not isinstance(content, ContentStream):
        content = ContentStream(content, self.pdf)
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.
    for operands, operator in content.operations:
        if operator == b_("Tj"):
            # Show a text string.
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_("T*"):
            # Move to the start of the next line.
            text += "\n"
        elif operator == b_("'"):
            # Move to the next line and show a text string.
            text += "\n"
            _text = operands[0]
            if isinstance(_text, TextStringObject):
                text += _text
        elif operator == b_('"'):
            # Same as ' but the string is the third operand.
            _text = operands[2]
            if isinstance(_text, TextStringObject):
                text += "\n"
                text += _text
        elif operator == b_("TJ"):
            # Array mixing strings and positioning numbers.
            for i in operands[0]:
                if isinstance(i, TextStringObject):
                    text += i
            text += "\n"
    return text
2622 
mediaBox = createRectangleAccessor("/MediaBox", ())
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
giving the boundaries of the physical medium on which the page is
meant to be displayed or printed.
"""

cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
giving the visible region of default user space. When the page is
displayed or printed, its contents are clipped (cropped) to this
rectangle and then imposed on the output medium in some
implementation-defined manner. Default value: same as :attr:`mediaBox<mediaBox>`.
"""

bleedBox = createRectangleAccessor("/BleedBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
giving the region to which the contents of the page should be clipped
when output in a production environment.
"""

trimBox = createRectangleAccessor("/TrimBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
giving the intended dimensions of the finished page after trimming.
"""

artBox = createRectangleAccessor("/ArtBox", ("/CropBox", "/MediaBox"))
"""
A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
giving the extent of the page's meaningful content as intended by the
page's creator.
"""
2658 
2659 
def __init__(self, stream, pdf):
    """
    Build a content stream by parsing ``stream`` into ``self.operations``.

    :param stream: a stream object, or an array of stream objects whose
        data is joined before parsing. ``getObject()`` is called on it
        (and on every element of an array) before its data is read.
    :param pdf: stored on ``self.pdf``; handed to ``readObject`` when an
        inline image dictionary is read.
    """
    self.pdf = pdf
    self.operations = []
    stream = stream.getObject()
    if isinstance(stream, ArrayObject):
        # A /Contents array may only be split at token boundaries, so the
        # parts are joined with whitespace. Gluing them together directly
        # could fuse the last token of one part with the first token of
        # the next (e.g. "Q" followed by "q" would be read as "Qq").
        data = b_("\n").join(part.getObject().getData() for part in stream)
    else:
        data = stream.getData()
    self.__parseContentStream(BytesIO(b_(data)))
2675 
def __parseContentStream(self, stream):
    """
    Parse ``stream`` from its beginning and append what is found to
    ``self.operations``.

    Every entry appended is an ``(operands, operator)`` tuple. An inline
    image is appended as ``(image, b"INLINE IMAGE")``, where ``image`` is
    the value returned by ``_readInlineImage``.

    :param stream: a seekable binary file-like object holding the content
        stream data.
    """
    stream.seek(0, 0)
    operands = []
    # An empty read means end of stream; it has to end a comment too,
    # otherwise a trailing comment without a newline would loop forever.
    comment_terminators = (b_('\r'), b_('\n'), b_(''))
    while True:
        peek = readNonWhitespace(stream)
        if peek == b_('') or ord_(peek) == 0:
            break
        # Put the byte back so the token readers below see it again.
        stream.seek(-1, 1)
        if peek.isalpha() or peek == b_("'") or peek == b_('"'):
            operator = utils.readUntilRegex(stream,
                    NameObject.delimiterPattern, True)
            if operator == b_("BI"):
                # begin inline image - a completely different parsing
                # mechanism is required.
                assert operands == []
                ii = self._readInlineImage(stream)
                self.operations.append((ii, b_("INLINE IMAGE")))
            else:
                self.operations.append((operands, operator))
                operands = []
        elif peek == b_('%'):
            # If we encounter a comment in the content stream, we have to
            # handle it here. Typically, readObject will handle
            # encountering a comment -- but readObject assumes that
            # following the comment must be the object we're trying to
            # read. In this case, it could be an operator instead.
            while peek not in comment_terminators:
                peek = stream.read(1)
        else:
            operands.append(readObject(stream, None))
2707 
def _readInlineImage(self, stream):
    """
    Read an inline image; ``stream`` is positioned just after ``BI``.

    First the key/value settings are read up to the ``ID`` operator, then
    the raw image bytes up to the terminating ``EI``. Because the bytes
    ``EI`` can also occur inside image data, ``EI`` only counts as the
    terminator when it is followed by whitespace and a ``Q`` operator, or
    when the stream ends after it.

    :param stream: seekable binary file-like object.
    :return: ``{"settings": <DictionaryObject>, "data": <image bytes>}``.
    :raises utils.PdfReadError: if the stream ends before the image is
        complete (previously this situation looped forever).
    """
    # first read the dictionary of settings.
    settings = DictionaryObject()
    while True:
        tok = readNonWhitespace(stream)
        if tok == b_(""):
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image dictionary")
        stream.seek(-1, 1)
        if tok == b_("I"):
            # "ID" - beginning of the image data
            break
        key = readObject(stream, self.pdf)
        tok = readNonWhitespace(stream)
        if tok == b_(""):
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image dictionary")
        stream.seek(-1, 1)
        value = readObject(stream, self.pdf)
        settings[key] = value
    # Positioned at "ID": consume it together with the one byte after it.
    tmp = stream.read(3)
    assert tmp[:2] == b_("ID")
    # Collect the pieces and join once instead of growing a byte string.
    chunks = []
    while True:
        # Read the inline image, while checking for EI (End Image) operator.
        tok = stream.read(1)
        if tok == b_(""):
            raise utils.PdfReadError(
                "Unexpected end of stream in inline image data")
        if tok != b_("E"):
            chunks.append(tok)
            continue
        tok2 = stream.read(1)
        if tok2 != b_("I"):
            # Not "EI": keep the "E" and look at the next byte again.
            # Nothing was read at end of stream, so there is nothing to
            # push back in that case.
            if tok2 != b_(""):
                stream.seek(-1, 1)
            chunks.append(tok)
            continue
        # Data can contain EI, so check for the Q operator.
        info = tok + tok2
        tok3 = stream.read(1)
        # We need to find whitespace between EI and Q.
        has_q_whitespace = False
        while tok3 and tok3 in utils.WHITESPACES:
            has_q_whitespace = True
            info += tok3
            tok3 = stream.read(1)
        if tok3 == b_(""):
            # The stream ends after "EI": no more data can follow, so
            # this "EI" is the terminator.
            break
        if tok3 == b_("Q") and has_q_whitespace:
            # Leave the "Q" for the caller to parse as an operator.
            stream.seek(-1, 1)
            break
        # The "EI" was part of the image data.
        stream.seek(-1, 1)
        chunks.append(info)
    return {"settings": settings, "data": b_("").join(chunks)}
2755 
def _getData(self):
    """
    Serialise ``self.operations`` back into content stream bytes.

    Each operation is written on its own line. Ordinary operations are
    written as their operands, each followed by a space, then the
    operator. Inline images are written as ``BI``, the settings, ``ID``,
    the image data and ``EI``.

    :return: the serialised content stream.
    """
    out = BytesIO()
    for operands, operator in self.operations:
        if operator != b_("INLINE IMAGE"):
            for operand in operands:
                operand.writeToStream(out, None)
                out.write(b_(" "))
            out.write(b_(operator))
        else:
            out.write(b_("BI"))
            settings_buf = BytesIO()
            operands["settings"].writeToStream(settings_buf, None)
            # Drop the first and last two bytes of the serialised
            # dictionary (presumably its "<<" / ">>" delimiters).
            out.write(settings_buf.getvalue()[2:-2])
            out.write(b_("ID "))
            out.write(operands["data"])
            out.write(b_("EI"))
        out.write(b_("\n"))
    return out.getvalue()
2774 
def _setData(self, value):
    """
    Parse ``value`` as content stream data.

    The parsed operations are appended to ``self.operations``; the list
    is not cleared here.

    :param value: content stream data, converted with ``b_`` before
        parsing.
    """
    raw = BytesIO(b_(value))
    self.__parseContentStream(raw)

# Reading ``_data`` serialises the operations; assigning to it parses.
_data = property(_getData, _setData)
2779 
2780 
2782  """
2783  A class representing the basic document metadata provided in a PDF File.
2784  This class is accessible through
2785  :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`
2786 
2787  All text properties of the document metadata have
2788  *two* properties, eg. author and author_raw. The non-raw property will
2789  always return a ``TextStringObject``, making it ideal for a case where
2790  the metadata is being displayed. The raw property can sometimes return
2791  a ``ByteStringObject``, if PyPDF2 was unable to decode the string's
2792  text encoding; this requires additional safety in the caller and
2793  therefore is not as commonly accessed.
2794  """
2795 
def __init__(self):
    """Initialise the ``DictionaryObject`` base with no arguments."""
    DictionaryObject.__init__(self)
2798 
def getText(self, key):
    """
    Return the entry stored under ``key`` if it is a
    ``TextStringObject``.

    :param key: dictionary key, e.g. ``"/Title"``.
    :return: the stored ``TextStringObject``, or ``None`` when the key is
        missing or its value is of another type.
    """
    value = self.get(key, None)
    return value if isinstance(value, TextStringObject) else None
2804 
title = property(lambda self: self.getText("/Title"))
"""Read-only property accessing the document's **title**.
Returns a unicode string (``TextStringObject``), or ``None``
if the title is not specified."""
title_raw = property(lambda self: self.get("/Title"))
"""The "raw" version of title; can return a ``ByteStringObject``."""

author = property(lambda self: self.getText("/Author"))
"""Read-only property accessing the document's **author**.
Returns a unicode string (``TextStringObject``), or ``None``
if the author is not specified."""
author_raw = property(lambda self: self.get("/Author"))
"""The "raw" version of author; can return a ``ByteStringObject``."""

subject = property(lambda self: self.getText("/Subject"))
"""Read-only property accessing the document's **subject**.
Returns a unicode string (``TextStringObject``), or ``None``
if the subject is not specified."""
subject_raw = property(lambda self: self.get("/Subject"))
"""The "raw" version of subject; can return a ``ByteStringObject``."""

creator = property(lambda self: self.getText("/Creator"))
"""Read-only property accessing the document's **creator**. If the
document was converted to PDF from another format, this is the name of
the application (e.g. OpenOffice) that created the original document
from which it was converted. Returns a unicode string
(``TextStringObject``), or ``None`` if the creator is not specified."""
creator_raw = property(lambda self: self.get("/Creator"))
"""The "raw" version of creator; can return a ``ByteStringObject``."""

producer = property(lambda self: self.getText("/Producer"))
"""Read-only property accessing the document's **producer**.
If the document was converted to PDF from another format, this is
the name of the application (for example, OSX Quartz) that converted
it to PDF. Returns a unicode string (``TextStringObject``),
or ``None`` if the producer is not specified."""
producer_raw = property(lambda self: self.get("/Producer"))
"""The "raw" version of producer; can return a ``ByteStringObject``."""
2843 
2844 
def convertToInt(d, size):
    """
    Decode the big-endian byte string ``d`` as an integer.

    ``d`` is left-padded with eight zero bytes, the last eight bytes are
    kept, and those are unpacked as a big-endian signed 64-bit value.

    :param d: the bytes to decode (converted with ``b_``).
    :param size: declared width in bytes; only used for validation.
    :return: the decoded integer.
    :raises utils.PdfReadError: if ``size`` is greater than 8.
    """
    if size > 8:
        raise utils.PdfReadError("invalid size in convertToInt")
    padded = b_("\x00\x00\x00\x00\x00\x00\x00\x00") + b_(d)
    (value,) = struct.unpack(">q", padded[-8:])
    return value
2851 
# The 32-byte password padding string.
# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
_encryption_padding = (
    b_('\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56')
    + b_('\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c')
    + b_('\xa9\xfe\x64\x53\x69\x7a')
)
2856 
2857 
def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
    """
    Compute an encryption key: algorithm 3.2 of the PDF standard security
    handler, section 3.5.2 of the PDF 1.6 reference.

    :param password: the password; normalised with ``str_`` before
        padding.
    :param rev: security handler revision.
    :param keylen: number of leading digest bytes used as the key.
    :param owner_entry: the encryption dictionary's /O entry; its
        ``original_bytes`` are hashed.
    :param p_entry: the /P entry as an integer. Both the signed and the
        unsigned 32-bit representation of the same bit pattern are
        accepted.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: when false and ``rev >= 3``, four ``0xFF``
        bytes are added to the hash.
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    import struct
    # 1. Pad or truncate the password string to exactly 32 bytes. If the
    # password string is more than 32 bytes long, use only its first 32
    # bytes; if it is less than 32 bytes long, pad it by appending the
    # required number of additional bytes from the beginning of the padding
    # string (_encryption_padding).
    password = b_((str_(password) + str_(_encryption_padding))[:32])
    # 2. Initialize the MD5 hash function and pass the result of step 1 as
    # input to this function.
    m = md5(password)
    # 3. Pass the value of the encryption dictionary's /O entry to the MD5
    # hash function.
    m.update(owner_entry.original_bytes)
    # 4. Treat the value of the /P entry as an unsigned 4-byte integer and
    # pass these bytes to the MD5 hash function, low-order byte first.
    # Masking to 32 bits and packing unsigned yields the same bytes as a
    # signed pack for values in the signed range, and no longer raises
    # struct.error when /P was stored as an unsigned number.
    m.update(struct.pack('<I', p_entry & 0xFFFFFFFF))
    # 5. Pass the first element of the file's file identifier array to the
    # MD5 hash function.
    m.update(id1_entry.original_bytes)
    # 6. (Revision 3 or greater) If document metadata is not being
    # encrypted, pass 4 bytes with the value 0xFFFFFFFF to the MD5 hash
    # function.
    if rev >= 3 and not metadata_encrypt:
        m.update(b_("\xff\xff\xff\xff"))
    # 7. Finish the hash.
    md5_hash = m.digest()
    # 8. (Revision 3 or greater) Do the following 50 times: Take the output
    # from the previous MD5 hash and pass the first n bytes of the output as
    # input into a new MD5 hash, where n is the number of bytes of the
    # encryption key as defined by the value of the encryption dictionary's
    # /Length entry.
    if rev >= 3:
        for _ in range(50):
            md5_hash = md5(md5_hash[:keylen]).digest()
    # 9. Set the encryption key to the first n bytes of the output from the
    # final MD5 hash, where n is always 5 for revision 2 but, for revision 3
    # or greater, depends on the value of the encryption dictionary's
    # /Length entry.
    return md5_hash[:keylen]
2900 
2901 
def _alg33(owner_pwd, user_pwd, rev, keylen):
    """
    Compute the /O entry of the encryption dictionary: algorithm 3.3 of
    the PDF standard security handler, section 3.5.2 of the PDF 1.6
    reference.

    :param owner_pwd: owner password, used to derive the RC4 key.
    :param user_pwd: user password, which gets padded and encrypted.
    :param rev: security handler revision.
    :param keylen: RC4 key length in bytes.
    :return: the output of the final RC4 invocation.
    """
    # Steps 1 - 4: derive the RC4 key from the owner password.
    rc4_key = _alg33_1(owner_pwd, rev, keylen)
    # 5. Pad or truncate the user password string as described in step 1
    # of algorithm 3.2.
    padded_user_pwd = b_((user_pwd + str_(_encryption_padding))[:32])
    # 6. Encrypt the result of step 5, using an RC4 encryption function
    # with the encryption key obtained in step 4.
    result = utils.RC4_encrypt(rc4_key, padded_user_pwd)
    # 7. (Revision 3 or greater) Do the following 19 times: re-encrypt the
    # previous output with a key built by XOR-ing every byte of the step 4
    # key with the iteration counter (from 1 to 19).
    if rev >= 3:
        for counter in range(1, 20):
            round_key = ''.join(chr(ord_(byte) ^ counter) for byte in rc4_key)
            result = utils.RC4_encrypt(round_key, result)
    # 8. The output of the final RC4 invocation is the value of the /O
    # entry in the encryption dictionary.
    return result
2928 
2929 
def _alg33_1(password, rev, keylen):
    """
    Steps 1-4 of algorithm 3.3: derive an RC4 key from a password.

    :param password: the owner password (per the algorithm, the user
        password is used instead when there is no owner password).
    :param rev: security handler revision.
    :param keylen: number of leading digest bytes used as the key.
    :return: the first ``keylen`` bytes of the final MD5 digest.
    """
    # 1. Pad or truncate the password string as described in step 1 of
    # algorithm 3.2.
    padded = b_((password + str_(_encryption_padding))[:32])
    # 2. Hash the result of step 1 with MD5.
    digest = md5(padded).digest()
    # 3. (Revision 3 or greater) Do the following 50 times: take the output
    # from the previous MD5 hash and pass it as input into a new MD5 hash.
    if rev >= 3:
        for _ in range(50):
            digest = md5(digest).digest()
    # 4. The RC4 key is the first n bytes of the final digest, where n is
    # always 5 for revision 2 but, for revision 3 or greater, depends on
    # the value of the encryption dictionary's /Length entry.
    return digest[:keylen]
2951 
2952 
def _alg34(password, owner_entry, p_entry, id1_entry):
    """
    Compute the /U entry: algorithm 3.4 of the PDF standard security
    handler, section 3.5.2 of the PDF 1.6 reference.

    :param password: the user password.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the encryption dictionary's /P entry.
    :param id1_entry: first element of the file identifier array.
    :return: a ``(U, key)`` tuple: the RC4-encrypted padding string and
        the encryption key used to produce it.
    """
    # 1. Create an encryption key based on the user password string, as
    # described in algorithm 3.2 (called with revision 2 and a 5-byte key).
    key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
    # 2. Encrypt the 32-byte padding string shown in step 1 of algorithm
    # 3.2 with RC4 under that key.
    # 3. The result is the value of the /U entry in the encryption
    # dictionary.
    return utils.RC4_encrypt(key, _encryption_padding), key
2966 
2967 
def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
    """
    Compute the /U entry: algorithm 3.5 of the PDF standard security
    handler, section 3.5.2 of the PDF 1.6 reference.

    :param password: the user password.
    :param rev: security handler revision.
    :param keylen: encryption key length in bytes.
    :param owner_entry: the encryption dictionary's /O entry.
    :param p_entry: the encryption dictionary's /P entry.
    :param id1_entry: first element of the file identifier array; its
        ``original_bytes`` are hashed.
    :param metadata_encrypt: accepted but not used by this function.
    :return: a ``(U, key)`` tuple: the 32-byte /U value and the
        encryption key used to produce it.
    """
    # 1. Create an encryption key based on the user password string, as
    # described in Algorithm 3.2.
    # NOTE(review): metadata_encrypt is not forwarded to _alg32, which
    # therefore uses its own default. Callers may depend on this; check
    # what they pass before changing it.
    key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
    # 2. Initialize the MD5 hash function and pass the 32-byte padding
    # string shown in step 1 of Algorithm 3.2 as input to this function.
    hasher = md5(_encryption_padding)
    # 3. Pass the first element of the file's file identifier array (the
    # value of the ID entry in the document's trailer dictionary; see
    # Table 3.13 on page 73) to the hash function and finish the hash.
    # (See implementation note 25 in Appendix H.)
    hasher.update(id1_entry.original_bytes)
    # 4. Encrypt the 16-byte result of the hash, using an RC4 encryption
    # function with the encryption key from step 1.
    val = utils.RC4_encrypt(key, hasher.digest())
    # 5. Do the following 19 times: re-encrypt the previous output with a
    # key built by XOR-ing every byte of the original encryption key with
    # the iteration counter (from 1 to 19).
    for counter in range(1, 20):
        round_key = b_('').join(b_(chr(ord_(byte) ^ counter)) for byte in key)
        val = utils.RC4_encrypt(round_key, val)
    # 6. Append 16 bytes of arbitrary padding to the output from the final
    # invocation of the RC4 function and store the 32-byte result as the
    # value of the U entry in the encryption dictionary.
    # (implementer's note: "arbitrary padding" is taken to mean null
    # bytes here, which seems to match a few other implementations.)
    return val + (b_('\x00') * 16), key
generic.TreeObject
Definition: generic.py:644
pdf.PdfFileWriter.addMetadata
def addMetadata(self, infos)
Definition: pdf.py:529
generic.DictionaryObject
Definition: generic.py:497
pdf.PdfFileReader._decryptObject
def _decryptObject(self, obj, key)
Definition: pdf.py:1636
pdf.PdfFileWriter.getOutlineRoot
def getOutlineRoot(self)
Definition: pdf.py:602
utils.PdfStreamError
Definition: utils.py:221
utils.u_
def u_(s)
Definition: utils.py:244
pdf.PageObject
Definition: pdf.py:2089
pdf.PdfFileWriter.insertBlankPage
def insertBlankPage(self, width=None, height=None, index=0)
Definition: pdf.py:190
utils.RC4_encrypt
def RC4_encrypt(key, plaintext)
Definition: utils.py:168
pdf.PdfFileWriter._pages
_pages
Definition: pdf.py:95
pdf.PdfFileReader._getPageNumberByIndirect
def _getPageNumberByIndirect(self, indirectRef)
Definition: pdf.py:1379
utils.isString
def isString(s)
Definition: utils.py:52
utils.formatWarning
def formatWarning(message, category, filename, lineno, line=None)
Definition: utils.py:68
pdf.PageObject.mergeScaledPage
def mergeScaledPage(self, page2, scale, expand=False)
Definition: pdf.py:2330
pdf.PdfFileWriter.getNamedDestRoot
def getNamedDestRoot(self)
Definition: pdf.py:616
pdf.PageObject.getContents
def getContents(self)
Definition: pdf.py:2216
pdf.PdfFileWriter._encrypt
_encrypt
Definition: pdf.py:442
generic.RectangleObject
Definition: generic.py:852
pdf.PdfFileWriter.removeText
def removeText(self, ignoreByteStringObject=False)
Definition: pdf.py:839
generic.BooleanObject
Definition: generic.py:119
pdf.PdfFileReader.trailer
trailer
Definition: pdf.py:1718
pdf.PdfFileWriter
Definition: pdf.py:79
pdf.PdfFileReader._buildDestination
def _buildDestination(self, title, array)
Definition: pdf.py:1422
utils.skipOverComment
def skipOverComment(stream)
Definition: utils.py:112
pdf.PdfFileReader.getIsEncrypted
def getIsEncrypted(self)
Definition: pdf.py:2043
pdf.PageObject._rotate
def _rotate(self, angle)
Definition: pdf.py:2164
pdf.PdfFileReader._buildField
def _buildField(self, field, retval, fileobj, fieldAttributes)
Definition: pdf.py:1231
pdf.PageObject.pdf
pdf
Definition: pdf.py:2104
pdf.convertToInt
def convertToInt(d, size)
Definition: pdf.py:2845
pdf.PdfFileReader.cacheIndirectObject
def cacheIndirectObject(self, generation, idnum, obj)
Definition: pdf.py:1676
pdf.PdfFileWriter._encrypt_key
_encrypt_key
Definition: pdf.py:443
pdf.PdfFileReader.getPageNumber
def getPageNumber(self, page)
Definition: pdf.py:1395
pdf.PdfFileReader.flattenedPages
flattenedPages
ensure that we're not trying to access an encrypted PDF assert not self.trailer.has_key("/Encrypt")
Definition: pdf.py:1074
pdf.PdfFileWriter.addBlankPage
def addBlankPage(self, width=None, height=None)
Definition: pdf.py:172
pdf.PdfFileWriter.addBookmarkDestination
def addBookmarkDestination(self, dest, parent=None)
Definition: pdf.py:651
utils.skipOverWhitespace
def skipOverWhitespace(stream)
Definition: utils.py:99
utils.PdfReadError
Definition: utils.py:209
pdf.PdfFileReader.getPage
def getPage(self, pageNumber)
Definition: pdf.py:1164
pdf.PdfFileReader._pageId2Num
_pageId2Num
Definition: pdf.py:1077
pdf.PdfFileReader.getPageMode
def getPageMode(self)
Definition: pdf.py:1478
pdf.PdfFileReader._writeField
def _writeField(self, fileobj, field, fieldAttributes)
Definition: pdf.py:1252
pdf.PageObject.rotateClockwise
def rotateClockwise(self, angle)
Definition: pdf.py:2142
pdf.PdfFileWriter.addNamedDestinationObject
def addNamedDestinationObject(self, dest)
Definition: pdf.py:751
pdf.PdfFileWriter.removeLinks
def removeLinks(self)
Definition: pdf.py:774
pdf.PdfFileReader.decrypt
def decrypt(self, password)
Definition: pdf.py:1965
pdf.createRectangleAccessor
def createRectangleAccessor(name, fallback)
Definition: pdf.py:2080
pdf.PdfFileReader.isEncrypted
isEncrypted
Definition: pdf.py:2046
pdf.PdfFileReader._pairs
def _pairs(self, array)
Definition: pdf.py:1922
pdf.PageObject.mergeRotatedScaledTranslatedPage
def mergeRotatedScaledTranslatedPage(self, page2, rotation, scale, tx, ty, expand=False)
Definition: pdf.py:2462
utils.readUntilWhitespace
def readUntilWhitespace(stream, maxchars=None)
Definition: utils.py:73
pdf.PageObject.rotateCounterClockwise
def rotateCounterClockwise(self, angle)
Definition: pdf.py:2153
pdf.PageObject._mergePage
def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False)
Definition: pdf.py:2241
pdf.getRectangle
def getRectangle(self, name, defaults)
Definition: pdf.py:2054
pdf.PdfFileReader._namedDests
_namedDests
Definition: pdf.py:1355
pdf.PdfFileReader.getNumPages
def getNumPages(self)
Definition: pdf.py:1131
pdf.PdfFileWriter.cloneDocumentFromReader
def cloneDocumentFromReader(self, reader, after_page_append=None)
Definition: pdf.py:382
utils.b_
def b_(s)
Definition: utils.py:226
pdf.PdfFileWriter._root_object
_root_object
Definition: pdf.py:111
pdf.PdfFileWriter._objects
_objects
Definition: pdf.py:86
pdf.PdfFileWriter.stack
stack
Definition: pdf.py:480
pdf.PdfFileWriter.getPageMode
def getPageMode(self)
Definition: pdf.py:1005
generic.FloatObject
Definition: generic.py:226
pdf.PdfFileWriter._header
_header
Definition: pdf.py:85
pdf.PdfFileReader.getDocumentInfo
def getDocumentInfo(self)
Definition: pdf.py:1089
pdf.PdfFileReader.strict
strict
Definition: pdf.py:1073
pdf.PageObject.compressContentStreams
def compressContentStreams(self)
Definition: pdf.py:2567
generic.IndirectObject
Definition: generic.py:171
pdf.DocumentInformation.__init__
def __init__(self)
Definition: pdf.py:2796
utils.PageSizeNotDefinedError
Definition: utils.py:213
pdf.PdfFileReader.getObject
def getObject(self, indirectReference)
Definition: pdf.py:1584
pdf.PageObject.createBlankPage
createBlankPage
Definition: pdf.py:2140
pdf.PdfFileWriter._info
_info
Definition: pdf.py:102
pdf.PdfFileWriter.getReference
def getReference(self, obj)
Definition: pdf.py:596
pdf.PageObject.mediaBox
mediaBox
Definition: pdf.py:2623
pdf.PdfFileReader._getObjectFromStream
def _getObjectFromStream(self, indirectReference)
Definition: pdf.py:1531
pdf.PdfFileWriter.addNamedDestination
def addNamedDestination(self, title, pagenum)
Definition: pdf.py:759
pdf.PdfFileWriter.updatePageFormFieldValues
def updatePageFormFieldValues(self, page, fields)
Definition: pdf.py:354
pdf.PageObject.scaleBy
def scaleBy(self, factor)
Definition: pdf.py:2544
pdf.PdfFileReader.read
def read(self, stream)
Definition: pdf.py:1685
generic.Field
Definition: generic.py:952
pdf.PdfFileWriter.removeImages
def removeImages(self, ignoreByteStringObject=False)
Definition: pdf.py:784
pdf.PdfFileReader.cacheGetIndirectObject
def cacheGetIndirectObject(self, generation, idnum)
Definition: pdf.py:1669
pdf.setRectangle
def setRectangle(self, name, value)
Definition: pdf.py:2070
pdf.PdfFileReader._checkKids
def _checkKids(self, tree, retval, fileobj)
Definition: pdf.py:1246
generic.readObject
def readObject(stream, pdf)
Definition: generic.py:54
pdf.PdfFileWriter._root
_root
Definition: pdf.py:110
pdf.PdfFileWriter._addObject
def _addObject(self, obj)
Definition: pdf.py:113
pdf.PdfFileReader.getOutlines
def getOutlines(self, node=None, outlines=None)
Definition: pdf.py:1333
pdf.PageObject.extractText
def extractText(self)
Definition: pdf.py:2581
generic.StreamObject.writeToStream
def writeToStream(self, stream, encryption_key)
Definition: generic.py:780
pdf.PdfFileReader.xref
xref
Definition: pdf.py:1716
pdf.PdfFileWriter.getNumPages
def getNumPages(self)
Definition: pdf.py:164
pdf.PageObject.indirectRef
indirectRef
Definition: pdf.py:2105
pdf.PdfFileWriter.addJS
def addJS(self, javascript)
Definition: pdf.py:214
generic.NullObject
Definition: generic.py:107
pdf.PdfFileWriter.setPageLayout
def setPageLayout(self, layout)
Definition: pdf.py:978
pdf.PageObject.addTransformation
def addTransformation(self, ctm)
Applies a transformation matrix to the page.
Definition: pdf.py:2498
pdf.PdfFileReader.getPageLayout
def getPageLayout(self)
Definition: pdf.py:1460
pdf.PdfFileWriter.appendPagesFromReader
def appendPagesFromReader(self, reader, after_page_append=None)
Definition: pdf.py:328
pdf.PdfFileWriter.addBookmarkDict
def addBookmarkDict(self, bookmark, parent=None)
Definition: pdf.py:665
pdf.PdfFileReader.xrefIndex
xrefIndex
Definition: pdf.py:1076
generic.NameObject
Definition: generic.py:467
generic.PdfObject.getObject
def getObject(self)
Definition: generic.py:102
pdf.PdfFileReader.__init__
def __init__(self, stream, strict=True, warndest=None, overwriteWarnings=True)
Definition: pdf.py:1061
utils.ConvertFunctionsToVirtualList
Definition: utils.py:144
pdf.PdfFileReader.pages
pages
Definition: pdf.py:1452
pdf.PdfFileWriter.write
def write(self, stream)
Definition: pdf.py:445
utils.matrixMultiply
def matrixMultiply(a, b)
Definition: utils.py:185
generic.DecodedStreamObject.getData
def getData(self)
Definition: generic.py:822
pdf.PdfFileReader._override_encryption
_override_encryption
Definition: pdf.py:1087
pdf.deleteRectangle
def deleteRectangle(self, name)
Definition: pdf.py:2076
generic.NumberObject
Definition: generic.py:251
pdf.PdfFileReader._flatten
def _flatten(self, pages=None, inherit=None, indirectRef=None)
Definition: pdf.py:1496
pdf.PdfFileReader.getFields
def getFields(self, tree=None, retval=None, fileobj=None)
Definition: pdf.py:1189
pdf.PdfFileWriter._valid_modes
list _valid_modes
Definition: pdf.py:1003
pdf.PdfFileReader._buildOutline
def _buildOutline(self, node)
Definition: pdf.py:1427
pdf.BytesIO
BytesIO
Definition: pdf.py:57
pdf.PdfFileWriter.getObject
def getObject(self, ido)
Definition: pdf.py:117
pdf.PdfFileReader.xref_objStm
xref_objStm
Definition: pdf.py:1717
utils.readNonWhitespace
def readNonWhitespace(stream)
Definition: utils.py:89
pdf.PdfFileReader.getNamedDestinations
def getNamedDestinations(self, tree=None, retval=None)
Definition: pdf.py:1286
pdf.PdfFileWriter.setPageMode
def setPageMode(self, mode)
Definition: pdf.py:1019
pdf.ContentStream.__parseContentStream
def __parseContentStream(self, stream)
Definition: pdf.py:2676
pdf.PdfFileReader.getXmpMetadata
def getXmpMetadata(self)
Definition: pdf.py:1109
pdf.PdfFileWriter._ID
_ID
Definition: pdf.py:427
pdf.PageObject.__init__
def __init__(self, pdf=None, indirectRef=None)
Definition: pdf.py:2102
pdf.PdfFileWriter.getPage
def getPage(self, pageNumber)
Definition: pdf.py:151
pdf.ContentStream.__init__
def __init__(self, stream, pdf)
Definition: pdf.py:2661
generic.createStringObject
def createStringObject(string)
Given a string (either a "str" or "unicode"), create a ByteStringObject or a TextStringObject to repr...
Definition: generic.py:280
pdf.PdfFileReader._authenticateUserPassword
def _authenticateUserPassword(self, password)
Definition: pdf.py:2025
pdf.PdfFileWriter._valid_layouts
list _valid_layouts
Definition: pdf.py:963
generic.Destination
Definition: generic.py:1024
pdf.PageObject.mergeScaledTranslatedPage
def mergeScaledTranslatedPage(self, page2, scale, tx, ty, expand=False)
Definition: pdf.py:2436
pdf.PdfFileReader.readNextEndLine
def readNextEndLine(self, stream)
Definition: pdf.py:1930
pdf.PdfFileWriter.addLink
def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args)
Definition: pdf.py:893
generic.ArrayObject
Definition: generic.py:141
pdf.PageObject.mergeTransformedPage
def mergeTransformedPage(self, page2, ctm, expand=False)
Definition: pdf.py:2315
utils.PdfReadWarning
Definition: utils.py:217
pdf.ContentStream._readInlineImage
def _readInlineImage(self, stream)
Definition: pdf.py:2708
pdf.PageObject.mergePage
def mergePage(self, page2)
Definition: pdf.py:2228
pdf.PageObject.scale
def scale(self, sx, sy)
Definition: pdf.py:2512
pdf.ContentStream.pdf
pdf
Definition: pdf.py:2662
generic.DecodedStreamObject
Definition: generic.py:821
pdf.PdfFileReader
Definition: pdf.py:1044
pdf.PdfFileWriter._sweepIndirectReferences
def _sweepIndirectReferences(self, externMap, data)
Definition: pdf.py:541
pdf.PdfFileWriter.addAttachment
def addAttachment(self, fname, fdata)
Definition: pdf.py:247
pdf.PageObject.mergeRotatedPage
def mergeRotatedPage(self, page2, rotation, expand=False)
Definition: pdf.py:2346
pdf.PdfFileReader.readObjectHeader
def readObjectHeader(self, stream)
Definition: pdf.py:1649
generic.TextStringObject
Represents a string object that has been decoded into a real unicode string.
Definition: generic.py:421
utils.readUntilRegex
def readUntilRegex(stream, regex, ignore_eof=False)
Definition: utils.py:120
pdf.PdfFileWriter.getPageLayout
def getPageLayout(self)
Definition: pdf.py:965
pdf.PdfFileWriter.encrypt
def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True)
Definition: pdf.py:398
pdf.PageObject.scaleTo
def scaleTo(self, width, height)
Definition: pdf.py:2553
pdf.PdfFileWriter.__init__
def __init__(self)
Definition: pdf.py:84
pdf.PdfFileWriter.addPage
def addPage(self, page)
Definition: pdf.py:130
pdf.PdfFileReader.stream
stream
Definition: pdf.py:1085
pdf.PdfFileWriter._addPage
def _addPage(self, page, action)
Definition: pdf.py:122
pdf.PdfFileReader._zeroXref
def _zeroXref(self, generation)
Definition: pdf.py:1919
pdf.DocumentInformation.getText
def getText(self, key)
Definition: pdf.py:2799
pdf.PageObject.mergeRotatedScaledPage
def mergeRotatedScaledPage(self, page2, rotation, scale, expand=False)
Definition: pdf.py:2410
pdf.PdfFileReader._decrypt
def _decrypt(self, password)
Definition: pdf.py:1991
pdf.PdfFileWriter.addBookmark
def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args)
Definition: pdf.py:690
pdf.PdfFileWriter.cloneReaderDocumentRoot
def cloneReaderDocumentRoot(self, reader)
Definition: pdf.py:373
pdf.PdfFileReader.resolvedObjects
resolvedObjects
Definition: pdf.py:1075
pdf.PdfFileReader.getFormTextFields
def getFormTextFields(self)
Definition: pdf.py:1276
pdf.ContentStream.operations
operations
Definition: pdf.py:2663
pdf.PdfFileReader.getDestinationPageNumber
def getDestinationPageNumber(self, destination)
Definition: pdf.py:1408
pdf.DocumentInformation
Definition: pdf.py:2781
utils.str_
def str_(b)
Definition: utils.py:251
utils.ord_
def ord_(b)
Definition: utils.py:261
pdf.PdfFileWriter.insertPage
def insertPage(self, page, index=0)
Definition: pdf.py:140
pdf.PageObject.mergeTranslatedPage
def mergeTranslatedPage(self, page2, tx, ty, expand=False)
Definition: pdf.py:2363
pdf.PdfFileReader._decryption_key
_decryption_key
Definition: pdf.py:1999
generic.ByteStringObject
Represents a string object where the text encoding could not be determined.
Definition: generic.py:400
pdf.ContentStream
Definition: pdf.py:2660
pdf.PageObject.mergeRotatedTranslatedPage
def mergeRotatedTranslatedPage(self, page2, rotation, tx, ty, expand=False)
Definition: pdf.py:2379