35 A pure-Python PDF library with an increasing number of capabilities.
36 See README for links to FAQ, documentation, homepage, etc.
# Package metadata: original author and current maintainer contact details.
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"

__maintainer__ = "Phaseit, Inc."
__maintainer_email__ = "PyPDF2@phaseit.net"
# Historical spelling without the trailing underscores; kept as an alias so
# that anything reading the old name keeps working.
__maintainer_email = __maintainer_email__
50 from sys
import version_info
51 if version_info < ( 3, 0 ):
52 from cStringIO
import StringIO
54 from io
import StringIO
56 if version_info < ( 3, 0 ):
59 from io
import BytesIO
65 from .generic
import *
66 from .utils
import readNonWhitespace, readUntilWhitespace, ConvertFunctionsToVirtualList
67 from .utils
import isString, b_, u_, ord_, chr_, str_, formatWarning
69 if version_info < ( 2, 4 ):
70 from sets
import ImmutableSet
as frozenset
72 if version_info < ( 2, 5 ):
75 from hashlib
import md5
81 This class supports writing PDF files out, given pages produced by another
82 class (typically :class:`PdfFileReader<PdfFileReader>`).
113 def _addObject(self, obj):
119 raise ValueError(
"pdf must be self")
122 def _addPage(self, page, action):
123 assert page[
"/Type"] ==
"/Page"
127 action(pages[
"/Kids"], page)
132 Adds a page to this PDF file. The page is usually acquired from a
133 :class:`PdfFileReader<PdfFileReader>` instance.
135 :param PageObject page: The page to add to the document. Should be
136 an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
142 Insert a page in this PDF file. The page is usually acquired from a
143 :class:`PdfFileReader<PdfFileReader>` instance.
145 :param PageObject page: The page to add to the document. This
146 argument should be an instance of :class:`PageObject<pdf.PageObject>`.
147 :param int index: Position at which the page will be inserted.
149 self.
_addPage(page,
lambda l, p: l.insert(index, p))
153 Retrieves a page by number from this PDF file.
155 :param int pageNumber: The page number to retrieve
156 (pages begin at zero)
157 :return: the page at the index given by *pageNumber*
158 :rtype: :class:`PageObject<pdf.PageObject>`
162 return pages[
"/Kids"][pageNumber].
getObject()
166 :return: the number of pages.
174 Appends a blank page to this PDF file and returns it. If no page size
175 is specified, use the size of the last page.
177 :param float width: The width of the new page expressed in default user
179 :param float height: The height of the new page expressed in default
181 :return: the newly appended page
182 :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
183 :raises PageSizeNotDefinedError: if width and height are not defined
184 and previous page does not exist.
186 page = PageObject.createBlankPage(self, width, height)
192 Inserts a blank page into this PDF file and returns it. If no page size
193 is specified, use the size of the last page.
195 :param float width: The width of the new page expressed in default user
197 :param float height: The height of the new page expressed in default
199 :param int index: Position to add the page.
200 :return: the newly inserted page
201 :rtype: :class:`PageObject<PyPDF2.pdf.PageObject>`
202 :raises PageSizeNotDefinedError: if width and height are not defined
203 and previous page does not exist.
205 if width
is None or height
is None and \
208 width = oldpage.mediaBox.getWidth()
209 height = oldpage.mediaBox.getHeight()
210 page = PageObject.createBlankPage(self, width, height)
216 Add Javascript which will launch upon opening this PDF.
218 :param str javascript: Your Javascript.
220 >>> output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")
221 # Example: This will launch the print window when the PDF is opened.
232 js_string_name = str(uuid.uuid4())
235 js_name_tree.update({
243 NameObject(
"/OpenAction"): js_indirect_object,
249 Embed a file inside the PDF.
251 :param str fname: The filename to display.
252 :param str fdata: The data in the file.
255 https://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
278 file_entry.setData(fdata)
293 efEntry.update({
NameObject(
"/F"):file_entry })
309 /Names << /EmbeddedFiles << /Names [(hello.txt) 7 0 R] >> >>
315 embeddedFilesNamesDictionary.update({
320 embeddedFilesDictionary.update({
321 NameObject(
"/EmbeddedFiles"): embeddedFilesNamesDictionary
330 Copy pages from reader to writer. Includes an optional callback parameter
331 which is invoked after pages are appended to the writer.
333 :param reader: a PdfFileReader object from which to copy page
334 annotations to this writer object. The writer's annots
336 :callback after_page_append (function): Callback function that is invoked after
337 each page is appended to the writer. Callback signature:
339 :param writer_pageref (PDF page reference): Reference to the page
340 appended to the writer.
343 reader_num_pages = reader.getNumPages()
347 for rpagenum
in range(0, reader_num_pages):
348 reader_page = reader.getPage(rpagenum)
350 writer_page = self.
getPage(writer_num_pages+rpagenum)
352 if callable(after_page_append): after_page_append(writer_page)
356 Update the form field values for a given page from a fields dictionary.
357 Copy field texts and values from fields to page.
359 :param page: Page reference from PDF writer where the annotations
360 and field data will be updated.
361 :param fields: a Python dictionary of field names (/T) and text
365 for j
in range(0, len(page[
'/Annots'])):
366 writer_annot = page[
'/Annots'][j].
getObject()
368 if writer_annot.get(
'/T') == field:
369 writer_annot.update({
375 Copy the reader document root to the writer.
377 :param reader: PdfFileReader from which the document root should be copied.
378 :callback after_page_append
384 Create a copy (clone) of a document from a PDF file reader
386 :param reader: PDF file reader instance from which the clone
388 :callback after_page_append (function): Callback function that is invoked after
389 each page is appended to the writer. Signature includes a reference to the
390 appended page (delegates to appendPagesFromReader). Callback signature:
392 :param writer_pageref (PDF page reference): Reference to the page just
393 appended to the document.
398 def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
400 Encrypt this PDF file with the PDF Standard encryption handler.
402 :param str user_pwd: The "user password", which allows for opening
403 and reading the PDF file with the restrictions provided.
404 :param str owner_pwd: The "owner password", which allows for
405 opening the PDF files without any restrictions. By default,
406 the owner password is the same as the user password.
407 :param bool use_128bit: flag as to whether to use 128bit
408 encryption. When false, 40bit encryption will be used. By default,
412 if owner_pwd ==
None:
417 keylen = int(128 / 8)
429 U, key = _alg34(user_pwd, O, P, ID_1)
432 U, key = _alg35(user_pwd, rev, keylen, O, P, ID_1,
False)
447 Writes the collection of pages added to this object out as a PDF file.
449 :param stream: An object to write the file to. The object must support
450 the write method and the tell method, similar to a file object.
452 if hasattr(stream,
'mode')
and 'b' not in stream.mode:
453 warnings.warn(
"File <%s> to write to is not in binary mode. It may not be written to correctly." % stream.name)
460 externalReferenceMap = {}
470 for objIndex
in range(len(self.
_objects)):
472 if isinstance(obj, PageObject)
and obj.indirectRef !=
None:
473 data = obj.indirectRef
474 if data.pdf
not in externalReferenceMap:
475 externalReferenceMap[data.pdf] = {}
476 if data.generation
not in externalReferenceMap[data.pdf]:
477 externalReferenceMap[data.pdf][data.generation] = {}
478 externalReferenceMap[data.pdf][data.generation][data.idnum] =
IndirectObject(objIndex + 1, 0, self)
481 if debug: print((
"ERM:", externalReferenceMap,
"root:", self.
_root))
486 object_positions = []
491 object_positions.append(stream.tell())
492 stream.write(
b_(str(idnum) +
" 0 obj\n"))
494 if hasattr(self,
"_encrypt")
and idnum != self.
_encrypt.idnum:
495 pack1 = struct.pack(
"<i", i + 1)[:3]
496 pack2 = struct.pack(
"<i", 0)[:2]
499 md5_hash = md5(key).digest()
501 obj.writeToStream(stream, key)
502 stream.write(
b_(
"\nendobj\n"))
505 xref_location = stream.tell()
506 stream.write(
b_(
"xref\n"))
507 stream.write(
b_(
"0 %s\n" % (len(self.
_objects) + 1)))
508 stream.write(
b_(
"%010d %05d f \n" % (0, 65535)))
509 for offset
in object_positions:
510 stream.write(
b_(
"%010d %05d n \n" % (offset, 0)))
513 stream.write(
b_(
"trailer\n"))
520 if hasattr(self,
"_ID"):
522 if hasattr(self,
"_encrypt"):
524 trailer.writeToStream(stream,
None)
527 stream.write(
b_(
"\nstartxref\n%s\n%%%%EOF\n" % (xref_location)))
531 Add custom metadata to the output.
533 :param dict infos: a Python dictionary where each key is a field
534 and each value is your new metadata.
537 for key, value
in list(infos.items()):
541 def _sweepIndirectReferences(self, externMap, data):
543 if debug: print((data,
"TYPE", data.__class__.__name__))
544 if isinstance(data, DictionaryObject):
545 for key, value
in list(data.items()):
548 if isinstance(value, StreamObject):
554 elif isinstance(data, ArrayObject):
555 for i
in range(len(data)):
557 if isinstance(value, StreamObject):
563 elif isinstance(data, IndirectObject):
566 if data.idnum
in self.
stack:
569 self.
stack.append(data.idnum)
574 newobj = externMap.get(data.pdf, {}).get(data.generation, {}).get(data.idnum,
None)
577 newobj = data.pdf.getObject(data)
581 if data.pdf
not in externMap:
582 externMap[data.pdf] = {}
583 if data.generation
not in externMap[data.pdf]:
584 externMap[data.pdf][data.generation] = {}
585 externMap[data.pdf][data.generation][data.idnum] = newobj_ido
597 idnum = self.
_objects.index(obj) + 1
599 assert ref.getObject() == obj
605 idnum = self.
_objects.index(outline) + 1
607 assert outlineRef.getObject() == outline
619 idnum = self.
_objects.index(names) + 1
621 assert namesRef.getObject() == names
622 if '/Dests' in names
and isinstance(names[
'/Dests'], DictionaryObject):
623 dests = names[
'/Dests']
624 idnum = self.
_objects.index(dests) + 1
626 assert destsRef.getObject() == dests
627 if '/Names' in dests:
659 parent = parent.getObject()
661 parent.addChild(destRef, self)
667 for k, v
in list(bookmark.items()):
669 bookmarkObj.update(bookmark)
673 for k, v
in list(bookmark[
'/A'].items()):
685 parent = parent.getObject()
686 parent.addChild(bookmarkRef, self)
690 def addBookmark(self, title, pagenum, parent=None, color=None, bold=False, italic=False, fit='/Fit', *args):
692 Add a bookmark to this PDF file.
694 :param str title: Title to use for this bookmark.
695 :param int pagenum: Page number this bookmark will point to.
696 :param parent: A reference to a parent bookmark to create nested
698 :param tuple color: Color of the bookmark as a red, green, blue tuple
700 :param bool bold: Bookmark is bold
701 :param bool italic: Bookmark is italic
702 :param str fit: The fit of the destination page. See
703 :meth:`addLink()<addLink>` for details.
714 destArray = dest.getDestArray()
733 if color
is not None:
746 parent = parent.getObject()
747 parent.addChild(bookmarkRef, self)
755 nd.extend([dest[
'/Title'], destRef])
770 nd.extend([title, destRef])
776 Removes links and annotations from this output.
781 if "/Annots" in pageRef:
782 del pageRef[
'/Annots']
786 Removes images from this output.
788 :param bool ignoreByteStringObject: optional parameter
789 to ignore ByteString Objects.
792 for j
in range(len(pages)):
795 content = pageRef[
'/Contents'].
getObject()
796 if not isinstance(content, ContentStream):
801 for operands, operator
in content.operations:
802 if operator ==
b_(
'Tj'):
804 if ignoreByteStringObject:
805 if not isinstance(text, TextStringObject):
807 elif operator ==
b_(
"'"):
809 if ignoreByteStringObject:
810 if not isinstance(text, TextStringObject):
812 elif operator ==
b_(
'"'):
814 if ignoreByteStringObject:
815 if not isinstance(text, TextStringObject):
817 elif operator ==
b_(
"TJ"):
818 for i
in range(len(operands[0])):
819 if ignoreByteStringObject:
820 if not isinstance(operands[0][i], TextStringObject):
823 if operator ==
b_(
'q'):
825 if operator ==
b_(
'Q'):
828 if operator
in [
b_(
'cm'),
b_(
'w'),
b_(
'J'),
b_(
'j'),
b_(
'M'),
b_(
'd'),
b_(
'ri'),
b_(
'i'),
829 b_(
'gs'),
b_(
'W'),
b_(
'b'),
b_(
's'),
b_(
'S'),
b_(
'f'),
b_(
'F'),
b_(
'n'),
b_(
'm'),
b_(
'l'),
832 if operator ==
b_(
're'):
834 _operations.append((operands, operator))
836 content.operations = _operations
837 pageRef.__setitem__(
NameObject(
'/Contents'), content)
841 Removes text from this output.
843 :param bool ignoreByteStringObject: optional parameter
844 to ignore ByteString Objects.
847 for j
in range(len(pages)):
850 content = pageRef[
'/Contents'].
getObject()
851 if not isinstance(content, ContentStream):
853 for operands,operator
in content.operations:
854 if operator ==
b_(
'Tj'):
856 if not ignoreByteStringObject:
857 if isinstance(text, TextStringObject):
860 if isinstance(text, TextStringObject)
or \
861 isinstance(text, ByteStringObject):
863 elif operator ==
b_(
"'"):
865 if not ignoreByteStringObject:
866 if isinstance(text, TextStringObject):
869 if isinstance(text, TextStringObject)
or \
870 isinstance(text, ByteStringObject):
872 elif operator ==
b_(
'"'):
874 if not ignoreByteStringObject:
875 if isinstance(text, TextStringObject):
878 if isinstance(text, TextStringObject)
or \
879 isinstance(text, ByteStringObject):
881 elif operator ==
b_(
"TJ"):
882 for i
in range(len(operands[0])):
883 if not ignoreByteStringObject:
884 if isinstance(operands[0][i], TextStringObject):
887 if isinstance(operands[0][i], TextStringObject)
or \
888 isinstance(operands[0][i], ByteStringObject):
891 pageRef.__setitem__(
NameObject(
'/Contents'), content)
893 def addLink(self, pagenum, pagedest, rect, border=None, fit='/Fit', *args):
895 Add an internal link from a rectangular area to the specified page.
897 :param int pagenum: index of the page on which to place the link.
898 :param int pagedest: index of the page to which the link should go.
899 :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
900 integers specifying the clickable rectangular area
901 ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
902 :param border: if provided, an array describing border-drawing
903 properties. See the PDF spec for details. No border will be
904 drawn if this argument is omitted.
905 :param str fit: Page fit or 'zoom' option (see below). Additional arguments may need
906 to be supplied. Passing ``None`` will be read as a null value for that coordinate.
908 Valid zoom arguments (see Table 8.2 of the PDF 1.7 reference for details):
909 /Fit No additional arguments
910 /XYZ [left] [top] [zoomFactor]
913 /FitR [left] [bottom] [right] [top]
914 /FitB No additional arguments
923 if border
is not None:
924 borderArr = [
NameObject(n)
for n
in border[:3]]
927 borderArr.append(dashPattern)
933 elif isinstance(rect, RectangleObject):
945 destArray = dest.getDestArray()
958 if "/Annots" in pageRef:
959 pageRef[
'/Annots'].append(lnkRef)
963 _valid_layouts = [
'/NoLayout',
'/SinglePage',
'/OneColumn',
'/TwoColumnLeft',
'/TwoColumnRight',
'/TwoPageLeft',
'/TwoPageRight']
968 See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` for a description of valid layouts.
970 :return: Page layout currently being used.
971 :rtype: str, None if not specified
982 :param str layout: The page layout to be used
985 /NoLayout Layout explicitly not specified
986 /SinglePage Show one page at a time
987 /OneColumn Show one column at a time
988 /TwoColumnLeft Show pages in two columns, odd-numbered pages on the left
989 /TwoColumnRight Show pages in two columns, odd-numbered pages on the right
990 /TwoPageLeft Show two pages at a time, odd-numbered pages on the left
991 /TwoPageRight Show two pages at a time, odd-numbered pages on the right
993 if not isinstance(layout, NameObject):
995 warnings.warn(
"Layout should be one of: {}".format(
', '.join(self.
_valid_layouts)))
999 pageLayout = property(getPageLayout, setPageLayout)
1000 """Read and write property accessing the :meth:`getPageLayout()<PdfFileWriter.getPageLayout>`
1001 and :meth:`setPageLayout()<PdfFileWriter.setPageLayout>` methods."""
1003 _valid_modes = [
'/UseNone',
'/UseOutlines',
'/UseThumbs',
'/FullScreen',
'/UseOC',
'/UseAttachments']
1008 See :meth:`setPageMode()<PdfFileWriter.setPageMode>` for a description
1011 :return: Page mode currently being used.
1012 :rtype: str, None if not specified
1023 :param str mode: The page mode to use.
1026 /UseNone Do not show outlines or thumbnails panels
1027 /UseOutlines Show outlines (aka bookmarks) panel
1028 /UseThumbs Show page thumbnails panel
1029 /FullScreen Fullscreen view
1030 /UseOC Show Optional Content Group (OCG) panel
1031 /UseAttachments Show attachments panel
1033 if not isinstance(mode, NameObject):
1035 warnings.warn(
"Mode should be one of: {}".format(
', '.join(self.
_valid_modes)))
1039 pageMode = property(getPageMode, setPageMode)
1040 """Read and write property accessing the :meth:`getPageMode()<PdfFileWriter.getPageMode>`
1041 and :meth:`setPageMode()<PdfFileWriter.setPageMode>` methods."""
1046 Initializes a PdfFileReader object. This operation can take some time, as
1047 the PDF stream's cross-reference tables are read into memory.
1049 :param stream: A File object or an object that supports the standard read
1050 and seek methods similar to a File object. Could also be a
1051 string representing a path to a PDF file.
1052 :param bool strict: Determines whether user should be warned of all
1053 problems and also causes some correctable problems to be fatal.
1054 Defaults to ``True``.
1055 :param warndest: Destination for logging warnings (defaults to
1057 :param bool overwriteWarnings: Determines whether to override Python's
1058 ``warnings.py`` module with a custom implementation (defaults to
1061 def __init__(self, stream, strict=True, warndest = None, overwriteWarnings = True):
1062 if overwriteWarnings:
1065 def _showwarning(message, category, filename, lineno, file=warndest, line=None):
1069 file.write(
formatWarning(message, category, filename, lineno, line))
1072 warnings.showwarning = _showwarning
1078 if hasattr(stream,
'mode')
and 'b' not in stream.mode:
1079 warnings.warn(
"PdfFileReader stream/file object is not in binary mode. It may not be read correctly.",
utils.PdfReadWarning)
1081 fileobj = open(stream,
'rb')
1091 Retrieves the PDF file's document information dictionary, if it exists.
1092 Note that some PDF files use metadata streams instead of docinfo
1093 dictionaries, and these metadata streams will not be accessed by this
1096 :return: the document information of this PDF file
1097 :rtype: :class:`DocumentInformation<pdf.DocumentInformation>` or ``None`` if none exists.
1099 if "/Info" not in self.
trailer:
1107 """Read-only property that accesses the :meth:`getDocumentInfo()<PdfFileReader.getDocumentInfo>` function."""
1111 Retrieves XMP (Extensible Metadata Platform) data from the PDF document
1114 :return: a :class:`XmpInformation<xmp.XmpInformation>`
1115 instance that can be used to access XMP metadata from the document.
1116 :rtype: :class:`XmpInformation<xmp.XmpInformation>` or
1117 ``None`` if no metadata was found on the document root.
1127 Read-only property that accesses the
1128 :meth:`getXmpMetadata()<PdfFileReader.getXmpMetadata>` function.
1133 Calculates the number of pages in this PDF file.
1135 :return: number of pages
1137 :raises PdfReadError: if file is encrypted and restrictions prevent
1148 return self.
trailer[
"/Root"][
"/Pages"][
"/Count"]
1160 Read-only property that accesses the
1161 :meth:`getNumPages()<PdfFileReader.getNumPages>` function.
1166 Retrieves a page by number from this PDF file.
1168 :param int pageNumber: The page number to retrieve
1169 (pages begin at zero)
1170 :return: a :class:`PageObject<pdf.PageObject>` instance.
1171 :rtype: :class:`PageObject<pdf.PageObject>`
1179 namedDestinations = property(
lambda self:
1182 Read-only property that accesses the
1183 :meth:`getNamedDestinations()<PdfFileReader.getNamedDestinations>` function.
1189 def getFields(self, tree = None, retval = None, fileobj = None):
1191 Extracts field data if this PDF contains interactive form fields.
1192 The *tree* and *retval* parameters are for recursive use.
1194 :param fileobj: A file object (usually a text file) to write
1195 a report to on all interactive form fields found.
1196 :return: A dictionary where each key is a field name, and each
1197 value is a :class:`Field<PyPDF2.generic.Field>` object. By
1198 default, the mapping name is used for keys.
1199 :rtype: dict, or ``None`` if form data could not be located.
1201 fieldAttributes = {
"/FT" :
"Field Type",
"/Parent" :
"Parent",
1202 "/T" :
"Field Name",
"/TU" :
"Alternate Field Name",
1203 "/TM" :
"Mapping Name",
"/Ff" :
"Field Flags",
1204 "/V" :
"Value",
"/DV" :
"Default Value"}
1207 catalog = self.
trailer[
"/Root"]
1209 if "/AcroForm" in catalog:
1210 tree = catalog[
"/AcroForm"]
1217 for attr
in fieldAttributes:
1220 self.
_buildField(tree, retval, fileobj, fieldAttributes)
1223 if "/Fields" in tree:
1224 fields = tree[
"/Fields"]
1226 field = f.getObject()
1227 self.
_buildField(field, retval, fileobj, fieldAttributes)
1231 def _buildField(self, field, retval, fileobj, fieldAttributes):
1244 retval[key] =
Field(field)
1246 def _checkKids(self, tree, retval, fileobj):
1249 for kid
in tree[
"/Kids"]:
1250 self.
getFields(kid.getObject(), retval, fileobj)
1252 def _writeField(self, fileobj, field, fieldAttributes):
1253 order = [
"/TM",
"/T",
"/FT",
"/Parent",
"/TU",
"/Ff",
"/V",
"/DV"]
1255 attrName = fieldAttributes[attr]
1259 types = {
"/Btn":
"Button",
"/Tx":
"Text",
"/Ch":
"Choice",
1261 if field[attr]
in types:
1262 fileobj.write(attrName +
": " + types[field[attr]] +
"\n")
1263 elif attr ==
"/Parent":
1266 name = field[
"/Parent"][
"/TM"]
1268 name = field[
"/Parent"][
"/T"]
1269 fileobj.write(attrName +
": " + name +
"\n")
1271 fileobj.write(attrName +
": " + str(field[attr]) +
"\n")
1277 ''' Retrieves form fields from the document with textual data (inputs, dropdowns)
1282 (formfields[field][
'/T'], formfields[field].get(
'/V'))
for field
in formfields \
1283 if formfields[field].get(
'/FT') ==
'/Tx'
1288 Retrieves the named destinations present in the document.
1290 :return: a dictionary which maps names to
1291 :class:`Destinations<PyPDF2.generic.Destination>`.
1296 catalog = self.
trailer[
"/Root"]
1299 if "/Dests" in catalog:
1300 tree = catalog[
"/Dests"]
1301 elif "/Names" in catalog:
1302 names = catalog[
'/Names']
1303 if "/Dests" in names:
1304 tree = names[
'/Dests']
1311 for kid
in tree[
"/Kids"]:
1314 if "/Names" in tree:
1315 names = tree[
"/Names"]
1316 for i
in range(0, len(names), 2):
1319 if isinstance(val, DictionaryObject)
and '/D' in val:
1329 Read-only property that accesses the
1330 :meth:`getOutlines()<PdfFileReader.getOutlines>` function.
1335 Retrieves the document outline present in the document.
1337 :return: a nested list of :class:`Destinations<PyPDF2.generic.Destination>`.
1339 if outlines ==
None:
1341 catalog = self.
trailer[
"/Root"]
1344 if "/Outlines" in catalog:
1346 lines = catalog[
"/Outlines"]
1353 if "/First" in lines:
1354 node = lines[
"/First"]
1364 outlines.append(outline)
1367 if "/First" in node:
1371 outlines.append(subOutlines)
1373 if "/Next" not in node:
1375 node = node[
"/Next"]
1379 def _getPageNumberByIndirect(self, indirectRef):
1380 """Generate _pageId2Num"""
1383 for i, x
in enumerate(self.
pages):
1384 id2num[x.indirectRef.idnum] = i
1387 if isinstance(indirectRef, int):
1390 idnum = indirectRef.idnum
1397 Retrieve page number of a given PageObject
1399 :param PageObject page: The page whose page number is wanted. Should be
1400 an instance of :class:`PageObject<PyPDF2.pdf.PageObject>`
1401 :return: the page number or -1 if page not found
1404 indirectRef = page.indirectRef
1410 Retrieve page number of a given Destination object
1412 :param Destination destination: The destination to get page number.
1413 Should be an instance of
1414 :class:`Destination<PyPDF2.pdf.Destination>`
1415 :return: the page number or -1 if page not found
1418 indirectRef = destination.page
1422 def _buildDestination(self, title, array):
1423 page, typ = array[0:2]
1427 def _buildOutline(self, node):
1428 dest, title, outline =
None,
None,
None
1430 if "/A" in node
and "/Title" in node:
1432 title = node[
"/Title"]
1434 if action[
"/S"] ==
"/GoTo":
1436 elif "/Dest" in node
and "/Title" in node:
1438 title = node[
"/Title"]
1439 dest = node[
"/Dest"]
1443 if isinstance(dest, ArrayObject):
1455 Read-only property that emulates a list based upon the
1456 :meth:`getNumPages()<PdfFileReader.getNumPages>` and
1457 :meth:`getPage()<PdfFileReader.getPage>` methods.
1462 Get the page layout.
1463 See :meth:`setPageLayout()<PdfFileWriter.setPageLayout>`
1464 for a description of valid layouts.
1466 :return: Page layout currently being used.
1467 :rtype: ``str``, ``None`` if not specified
1470 return self.
trailer[
'/Root'][
'/PageLayout']
1474 pageLayout = property(getPageLayout)
1475 """Read-only property accessing the
1476 :meth:`getPageLayout()<PdfFileReader.getPageLayout>` method."""
1481 See :meth:`setPageMode()<PdfFileWriter.setPageMode>`
1482 for a description of valid modes.
1484 :return: Page mode currently being used.
1485 :rtype: ``str``, ``None`` if not specified
1488 return self.
trailer[
'/Root'][
'/PageMode']
1492 pageMode = property(getPageMode)
1493 """Read-only property accessing the
1494 :meth:`getPageMode()<PdfFileReader.getPageMode>` method."""
1496 def _flatten(self, pages=None, inherit=None, indirectRef=None):
1497 inheritablePageAttributes = (
1509 if "/Type" in pages:
1513 for attr
in inheritablePageAttributes:
1515 inherit[attr] = pages[attr]
1516 for page
in pages[
"/Kids"]:
1518 if isinstance(page, IndirectObject):
1519 addt[
"indirectRef"] = page
1520 self.
_flatten(page.getObject(), inherit, **addt)
1522 for attr, value
in list(inherit.items()):
1525 if attr
not in pages:
1528 pageObj.update(pages)
1531 def _getObjectFromStream(self, indirectReference):
1535 stmnum, idx = self.
xref_objStm[indirectReference.idnum]
1536 if debug: print((
"Here1: %s %s"%(stmnum, idx)))
1538 if debug: print((
"Here2: objStm=%s.. stmnum=%s data=%s"%(objStm, stmnum, objStm.getData())))
1540 assert objStm[
'/Type'] ==
'/ObjStm'
1542 assert idx < objStm[
'/N']
1543 streamData =
BytesIO(
b_(objStm.getData()))
1544 for i
in range(objStm[
'/N']):
1546 streamData.seek(-1, 1)
1547 objnum = NumberObject.readFromStream(streamData)
1549 streamData.seek(-1, 1)
1550 offset = NumberObject.readFromStream(streamData)
1552 streamData.seek(-1, 1)
1553 if objnum != indirectReference.idnum:
1556 if self.
strict and idx != i:
1558 streamData.seek(objStm[
'/First']+offset, 0)
1560 pos = streamData.tell()
1561 streamData.seek(0, 0)
1562 lines = streamData.readlines()
1563 for i
in range(0, len(lines)):
1565 streamData.seek(pos, 0)
1571 e = sys.exc_info()[1]
1572 warnings.warn(
"Invalid stream (index %d) within object %d %d: %s" % \
1586 if debug: print((
"looking at:", indirectReference.idnum, indirectReference.generation))
1588 indirectReference.idnum)
1591 if indirectReference.generation == 0
and \
1594 elif indirectReference.generation
in self.
xref and \
1595 indirectReference.idnum
in self.
xref[indirectReference.generation]:
1596 start = self.
xref[indirectReference.generation][indirectReference.idnum]
1597 if debug: print((
" Uncompressed Object", indirectReference.idnum, indirectReference.generation,
":", start))
1598 self.
stream.seek(start, 0)
1600 if idnum != indirectReference.idnum
and self.
xrefIndex:
1603 raise utils.PdfReadError(
"Expected object ID (%d %d) does not match actual (%d %d); xref table not zero-indexed." \
1604 % (indirectReference.idnum, indirectReference.generation, idnum, generation))
1606 elif idnum != indirectReference.idnum:
1608 raise utils.PdfReadError(
"Expected object ID (%d %d) does not match actual (%d %d)." \
1609 % (indirectReference.idnum, indirectReference.generation, idnum, generation))
1610 assert generation == indirectReference.generation
1616 if not hasattr(self,
'_decryption_key'):
1620 pack1 = struct.pack(
"<i", indirectReference.idnum)[:3]
1621 pack2 = struct.pack(
"<i", indirectReference.generation)[:2]
1624 md5_hash = md5(key).digest()
1628 warnings.warn(
"Object %d %d not defined."%(indirectReference.idnum,
1633 indirectReference.idnum, retval)
1636 def _decryptObject(self, obj, key):
1637 if isinstance(obj, ByteStringObject)
or isinstance(obj, TextStringObject):
1639 elif isinstance(obj, StreamObject):
1641 elif isinstance(obj, DictionaryObject):
1642 for dictkey, value
in list(obj.items()):
1644 elif isinstance(obj, ArrayObject):
1645 for i
in range(len(obj)):
1660 obj = stream.read(3)
1663 if (extra
and self.
strict):
1665 warnings.warn(
"Superfluous whitespace found in object header %s %s" % \
1667 return int(idnum), int(generation)
1672 if debug
and out: print((
"cache hit: %d %d"%(idnum, generation)))
1673 elif debug: print((
"cache miss: %d %d"%(idnum, generation)))
1679 msg =
"Overwriting cache for %s %s"%(generation, idnum)
1681 else: warnings.warn(msg)
1687 if debug: print(
">>read", stream)
1690 if not stream.tell():
1692 last1K = stream.tell() - 1024 + 1
1694 while line[:5] !=
b_(
"%%EOF"):
1695 if stream.tell() < last1K:
1698 if debug: print(
" line:",line)
1703 startxref = int(line)
1706 if not line.startswith(
b_(
"startxref")):
1708 startxref = int(line[9:].strip())
1709 warnings.warn(
"startxref on same line as offset")
1712 if line[:9] !=
b_(
"startxref"):
1721 stream.seek(startxref, 0)
1725 ref = stream.read(4)
1726 if ref[:3] !=
b_(
"ref"):
1733 if firsttime
and num != 0:
1736 warnings.warn(
"Xref table not zero-indexed. ID numbers for objects will be corrected.",
utils.PdfReadWarning)
1747 line = stream.read(20)
1755 while line[0]
in b_(
"\x0D\x0A"):
1756 stream.seek(-20 + 1, 1)
1757 line = stream.read(20)
1765 if line[-1]
in b_(
"0123456789t"):
1768 offset, generation = line[:16].split(
b_(
" "))
1769 offset, generation = int(offset), int(generation)
1770 if generation
not in self.
xref:
1771 self.
xref[generation] = {}
1772 if num
in self.
xref[generation]:
1779 self.
xref[generation][num] = offset
1784 trailertag = stream.read(7)
1785 if trailertag !=
b_(
"trailer"):
1793 for key, value
in list(newTrailer.items()):
1796 if "/Prev" in newTrailer:
1797 startxref = newTrailer[
"/Prev"]
1805 assert xrefstream[
"/Type"] ==
"/XRef"
1807 streamData =
BytesIO(
b_(xrefstream.getData()))
1810 idx_pairs = xrefstream.get(
"/Index", [0, xrefstream.get(
"/Size")])
1811 if debug: print((
"read idx_pairs=%s"%list(self.
_pairs(idx_pairs))))
1812 entrySizes = xrefstream.get(
"/W")
1813 assert len(entrySizes) >= 3
1814 if self.
strict and len(entrySizes) > 3:
1820 if entrySizes[i] > 0:
1821 d = streamData.read(entrySizes[i])
1829 def used_before(num, generation):
1831 return num
in self.
xref.get(generation, [])
or \
1836 for start, size
in self.
_pairs(idx_pairs):
1838 assert start >= last_end
1839 last_end = start + size
1840 for num
in range(start, start+size):
1842 xref_type = getEntry(0)
1846 next_free_object = getEntry(1)
1847 next_generation = getEntry(2)
1848 elif xref_type == 1:
1850 byte_offset = getEntry(1)
1851 generation = getEntry(2)
1852 if generation
not in self.
xref:
1853 self.
xref[generation] = {}
1854 if not used_before(num, generation):
1855 self.
xref[generation][num] = byte_offset
1856 if debug: print((
"XREF Uncompressed: %s %s"%(
1858 elif xref_type == 2:
1860 objstr_num = getEntry(1)
1861 obstr_idx = getEntry(2)
1863 if not used_before(num, generation):
1864 if debug: print((
"XREF Compressed: %s %s %s"%(
1865 num, objstr_num, obstr_idx)))
1871 trailerKeys =
"/Root",
"/Encrypt",
"/Info",
"/ID"
1872 for key
in trailerKeys:
1873 if key
in xrefstream
and key
not in self.
trailer:
1875 if "/Prev" in xrefstream:
1876 startxref = xrefstream[
"/Prev"]
1884 tmp = stream.read(20)
1885 xref_loc = tmp.find(
b_(
"xref"))
1887 startxref -= (10 - xref_loc)
1890 stream.seek(startxref, 0)
1892 for look
in range(5):
1893 if stream.read(1).isdigit():
1905 for gen
in self.
xref:
1906 if gen == 65535:
continue
1907 for id
in self.
xref[gen]:
1908 stream.seek(self.
xref[gen][id], 0)
def _zeroXref(self, generation):
    """
    Rebase the object numbers stored for one generation.

    Each key of ``self.xref[generation]`` has ``self.xrefIndex`` subtracted
    from it; the associated values are kept as they are. The entry for
    *generation* is then replaced by the newly built dictionary.

    :param generation: key into ``self.xref`` selecting the table to rebase.
    """
    # Snapshot the items first so the table lookup happens before
    # self.xrefIndex is touched, exactly as in the dict(...) form.
    entries = list(self.xref[generation].items())
    rebased = {}
    for obj_num, offset in entries:
        rebased[obj_num - self.xrefIndex] = offset
    self.xref[generation] = rebased
1922 def _pairs(self, array):
1925 yield array[i], array[i+1]
1927 if (i+1) >= len(array):
1932 if debug: print(
">>readNextEndLine")
1936 if stream.tell() == 0:
1939 if debug: print((
" x:", x,
"%x"%ord(x)))
1940 if stream.tell() < 2:
1943 if x ==
b_(
'\n')
or x ==
b_(
'\r'):
1945 while x ==
b_(
'\n')
or x ==
b_(
'\r'):
1947 if ord(x) == 0x0D: print(
" x is CR 0D")
1948 elif ord(x) == 0x0A: print(
" x is LF 0A")
1950 if x ==
b_(
'\n')
or x ==
b_(
'\r'):
1953 if stream.tell() < 2:
1956 stream.seek(2
if crlf
else 1, 1)
1959 if debug: print(
" x is neither")
1961 if debug: print((
" RNEL line:", line))
1962 if debug: print(
"leaving RNEL")
1967 When using an encrypted / secured PDF file with the PDF Standard
1968 encryption handler, this function will allow the file to be decrypted.
1969 It checks the given password against the document's user password and
1970 owner password, and then stores the resulting decryption key if either
1971 password is correct.
1973 It does not matter which password was matched. Both passwords provide
1974 the correct decryption key that will allow the document to be used with
1977 :param str password: The password to match.
1978 :return: ``0`` if the password failed, ``1`` if the password matched the user
1979 password, and ``2`` if the password matched the owner password.
1981 :raises NotImplementedError: if document uses an unsupported encryption
1991 def _decrypt(self, password):
1993 if encrypt[
'/Filter'] !=
'/Standard':
1994 raise NotImplementedError(
"only Standard PDF encryption handler is available")
1995 if not (encrypt[
'/V']
in (1, 2)):
1996 raise NotImplementedError(
"only algorithm code 1 and 2 are supported")
2006 keylen = encrypt[
'/Length'].
getObject() // 8
2007 key = _alg33_1(password, rev, keylen)
2013 for i
in range(19, -1, -1):
2015 for l
in range(len(key)):
2025 def _authenticateUserPassword(self, password):
2032 real_U = encrypt[
'/U'].
getObject().original_bytes
2034 U, key = _alg34(password, owner_entry, p_entry, id1_entry)
2036 U, key = _alg35(password, rev,
2037 encrypt[
"/Length"].
getObject() // 8, owner_entry,
2040 U, real_U = U[:16], real_U[:16]
2041 return U == real_U, key
2044 return "/Encrypt" in self.
trailer
2048 Read-only boolean property showing whether this PDF file is encrypted.
2049 Note that this property, if true, will remain true even after the
2050 :meth:`decrypt()<PdfFileReader.decrypt>` method is called.
2055 retval = self.get(name)
2056 if isinstance(retval, RectangleObject):
2060 retval = self.get(d)
2063 if isinstance(retval, IndirectObject):
2064 retval = self.pdf.getObject(retval)
2071 if not isinstance(name, NameObject):
2091 This class represents a single page within a PDF file. Typically this
2092 object will be created by accessing the
2093 :meth:`getPage()<PyPDF2.PdfFileReader.getPage>` method of the
2094 :class:`PdfFileReader<PyPDF2.PdfFileReader>` class, but it is
2095 also possible to create an empty page with the
2096 :meth:`createBlankPage()<PageObject.createBlankPage>` static method.
2098 :param pdf: PDF file the page belongs to.
2099 :param indirectRef: Stores the original indirect reference to
2100 this object in its source PDF
2103 DictionaryObject.__init__(self)
2109 Returns a new blank page.
2110 If ``width`` or ``height`` is ``None``, try to get the page size
2111 from the last page of *pdf*.
2113 :param pdf: PDF file the page belongs to
2114 :param float width: The width of the new page expressed in default user
2116 :param float height: The height of the new page expressed in default user
2118 :return: the new blank page.
2119 :rtype: :class:`PageObject<PageObject>`
2120 :raises PageSizeNotDefinedError: if ``pdf`` is ``None`` or contains
2129 if width
is None or height
is None:
2130 if pdf
is not None and pdf.getNumPages() > 0:
2131 lastpage = pdf.getPage(pdf.getNumPages() - 1)
2132 width = lastpage.mediaBox.getWidth()
2133 height = lastpage.mediaBox.getHeight()
2140 createBlankPage = staticmethod(createBlankPage)
2144 Rotates a page clockwise by increments of 90 degrees.
2146 :param int angle: Angle to rotate the page. Must be an increment
2149 assert angle % 90 == 0
2155 Rotates a page counter-clockwise by increments of 90 degrees.
2157 :param int angle: Angle to rotate the page. Must be an increment
2160 assert angle % 90 == 0
2164 def _rotate(self, angle):
2165 currentAngle = self.get(
"/Rotate", 0)
# Merge one resource category of two pages' resource dictionaries.
# Returns a pair: the merged dictionary (newRes) and a mapping from page2's
# original names to the replacement names chosen on collision (renameRes).
# NOTE(review): newRes, page2Res and renameRes are initialised on lines not
# visible here -- presumably from res1/res2 for the given `resource` key;
# confirm against the full source.
2168 def _mergeResources(res1, res2, resource):
# Iterate over a snapshot of page2's resource names.
2173 for key
in list(page2Res.keys()):
# Same name on both sides but different raw (unresolved) values: page2's
# entry must be stored under a new name.
2174 if key
in newRes
and newRes.raw_get(key) != page2Res.raw_get(key):
# The replacement name is the old name with a random UUID appended.
2175 newname =
NameObject(key + str(uuid.uuid4()))
2176 renameRes[key] = newname
# NOTE(review): this branch stores page2Res[key], while the branch below
# stores page2Res.raw_get(key) -- confirm the difference is intentional.
2177 newRes[newname] = page2Res[key]
# Name only present on page2: copy the raw value across unchanged.
2178 elif key
not in newRes:
2179 newRes[key] = page2Res.raw_get(key)
2180 return newRes, renameRes
# Pre-decorator idiom: expose the function as a static method.
2181 _mergeResources = staticmethod(_mergeResources)
2183 def _contentStreamRename(stream, rename, pdf):
2187 for operands, operator
in stream.operations:
2188 for i
in range(len(operands)):
2190 if isinstance(op, NameObject):
2191 operands[i] = rename.get(op,op)
2193 _contentStreamRename = staticmethod(_contentStreamRename)
2195 def _pushPopGS(contents, pdf):
2200 stream.operations.insert(0, [[],
"q"])
2201 stream.operations.append([[],
"Q"])
2203 _pushPopGS = staticmethod(_pushPopGS)
2205 def _addTransformationMatrix(contents, pdf, ctm):
2208 a, b, c, d, e, f = ctm
2214 _addTransformationMatrix = staticmethod(_addTransformationMatrix)
2218 Accesses the page contents.
2220 :return: the ``/Contents`` object, or ``None`` if it doesn't exist.
2221 ``/Contents`` is optional, as described in PDF Reference 7.7.3.3
2223 if "/Contents" in self:
2230 Merges the content streams of two pages into one. Resource references
2231 (i.e. fonts) are maintained from both pages. The mediabox/cropbox/etc
2232 of this page are not altered. The parameter page's content stream will
2233 be added to the end of this page's content stream, meaning that it will
2234 be drawn after, or "on top" of this page.
2236 :param PageObject page2: The page to be merged into this one. Should be
2237 an instance of :class:`PageObject<PageObject>`.
2241 def _mergePage(self, page2, page2transformation=None, ctm=None, expand=False):
2248 originalResources = self[
"/Resources"].
getObject()
2249 page2Resources = page2[
"/Resources"].
getObject()
2252 for page
in (self, page2):
2253 if "/Annots" in page:
2254 annots = page[
"/Annots"]
2255 if isinstance(annots, ArrayObject):
2257 newAnnots.append(ref)
2259 for res
in "/ExtGState",
"/Font",
"/XObject",
"/ColorSpace",
"/Pattern",
"/Shading",
"/Properties":
2260 new, newrename = PageObject._mergeResources(originalResources, page2Resources, res)
2263 rename.update(newrename)
2275 if originalContent
is not None:
2276 newContentArray.append(PageObject._pushPopGS(
2277 originalContent, self.
pdf))
2279 page2Content = page2.getContents()
2280 if page2Content
is not None:
2281 if page2transformation
is not None:
2282 page2Content = page2transformation(page2Content)
2283 page2Content = PageObject._contentStreamRename(
2284 page2Content, rename, self.
pdf)
2285 page2Content = PageObject._pushPopGS(page2Content, self.
pdf)
2286 newContentArray.append(page2Content)
2290 corners1 = [self.
mediaBox.getLowerLeft_x().as_numeric(), self.
mediaBox.getLowerLeft_y().as_numeric(),
2291 self.
mediaBox.getUpperRight_x().as_numeric(), self.
mediaBox.getUpperRight_y().as_numeric()]
2292 corners2 = [page2.mediaBox.getLowerLeft_x().as_numeric(), page2.mediaBox.getLowerLeft_y().as_numeric(),
2293 page2.mediaBox.getUpperLeft_x().as_numeric(), page2.mediaBox.getUpperLeft_y().as_numeric(),
2294 page2.mediaBox.getUpperRight_x().as_numeric(), page2.mediaBox.getUpperRight_y().as_numeric(),
2295 page2.mediaBox.getLowerRight_x().as_numeric(), page2.mediaBox.getLowerRight_y().as_numeric()]
2297 ctm = [float(x)
for x
in ctm]
2298 new_x = [ctm[0]*corners2[i] + ctm[2]*corners2[i+1] + ctm[4]
for i
in range(0, 8, 2)]
2299 new_y = [ctm[1]*corners2[i] + ctm[3]*corners2[i+1] + ctm[5]
for i
in range(0, 8, 2)]
2301 new_x = corners2[0:8:2]
2302 new_y = corners2[1:8:2]
2303 lowerleft = [min(new_x), min(new_y)]
2304 upperright = [max(new_x), max(new_y)]
2305 lowerleft = [min(corners1[0], lowerleft[0]), min(corners1[1], lowerleft[1])]
2306 upperright = [max(corners1[2], upperright[0]), max(corners1[3], upperright[1])]
2308 self.
mediaBox.setLowerLeft(lowerleft)
2309 self.
mediaBox.setUpperRight(upperright)
2312 self[
NameObject(
'/Resources')] = newResources
2317 This is similar to mergePage, but a transformation matrix is
2318 applied to the merged stream.
2320 :param PageObject page2: The page to be merged into this one. Should be
2321 an instance of :class:`PageObject<PageObject>`.
2322 :param tuple ctm: a 6-element tuple containing the operands of the
2323 transformation matrix
2324 :param bool expand: Whether the page should be expanded to fit the dimensions
2325 of the page to be merged.
2328 PageObject._addTransformationMatrix(page2Content, page2.pdf, ctm), ctm, expand)
2332 This is similar to mergePage, but the stream to be merged is scaled
2333 by applying a transformation matrix.
2335 :param PageObject page2: The page to be merged into this one. Should be
2336 an instance of :class:`PageObject<PageObject>`.
2337 :param float scale: The scaling factor
2338 :param bool expand: Whether the page should be expanded to fit the
2339 dimensions of the page to be merged.
2348 This is similar to mergePage, but the stream to be merged is rotated
2349 by applying a transformation matrix.
2351 :param PageObject page2: the page to be merged into this one. Should be
2352 an instance of :class:`PageObject<PageObject>`.
2353 :param float rotation: The angle of the rotation, in degrees
2354 :param bool expand: Whether the page should be expanded to fit the
2355 dimensions of the page to be merged.
2357 rotation = math.radians(rotation)
2359 [math.cos(rotation), math.sin(rotation),
2360 -math.sin(rotation), math.cos(rotation),
2365 This is similar to mergePage, but the stream to be merged is translated
2366 by applying a transformation matrix.
2368 :param PageObject page2: the page to be merged into this one. Should be
2369 an instance of :class:`PageObject<PageObject>`.
2370 :param float tx: The translation on X axis
2371 :param float ty: The translation on Y axis
2372 :param bool expand: Whether the page should be expanded to fit the
2373 dimensions of the page to be merged.
2381 This is similar to mergePage, but the stream to be merged is rotated
2382 and translated by applying a transformation matrix.
2384 :param PageObject page2: the page to be merged into this one. Should be
2385 an instance of :class:`PageObject<PageObject>`.
2386 :param float tx: The translation on X axis
2387 :param float ty: The translation on Y axis
2388 :param float rotation: The angle of the rotation, in degrees
2389 :param bool expand: Whether the page should be expanded to fit the
2390 dimensions of the page to be merged.
2393 translation = [[1, 0, 0],
2396 rotation = math.radians(rotation)
2397 rotating = [[math.cos(rotation), math.sin(rotation), 0],
2398 [-math.sin(rotation), math.cos(rotation), 0],
2400 rtranslation = [[1, 0, 0],
2407 ctm[1][0], ctm[1][1],
2408 ctm[2][0], ctm[2][1]], expand)
2412 This is similar to mergePage, but the stream to be merged is rotated
2413 and scaled by applying a transformation matrix.
2415 :param PageObject page2: the page to be merged into this one. Should be
2416 an instance of :class:`PageObject<PageObject>`.
2417 :param float rotation: The angle of the rotation, in degrees
2418 :param float scale: The scaling factor
2419 :param bool expand: Whether the page should be expanded to fit the
2420 dimensions of the page to be merged.
2422 rotation = math.radians(rotation)
2423 rotating = [[math.cos(rotation), math.sin(rotation), 0],
2424 [-math.sin(rotation), math.cos(rotation), 0],
2426 scaling = [[scale, 0, 0],
2432 [ctm[0][0], ctm[0][1],
2433 ctm[1][0], ctm[1][1],
2434 ctm[2][0], ctm[2][1]], expand)
2438 This is similar to mergePage, but the stream to be merged is translated
2439 and scaled by applying a transformation matrix.
2441 :param PageObject page2: the page to be merged into this one. Should be
2442 an instance of :class:`PageObject<PageObject>`.
2443 :param float scale: The scaling factor
2444 :param float tx: The translation on X axis
2445 :param float ty: The translation on Y axis
2446 :param bool expand: Whether the page should be expanded to fit the
2447 dimensions of the page to be merged.
2450 translation = [[1, 0, 0],
2453 scaling = [[scale, 0, 0],
2459 ctm[1][0], ctm[1][1],
2460 ctm[2][0], ctm[2][1]], expand)
2464 This is similar to mergePage, but the stream to be merged is translated,
2465 rotated and scaled by applying a transformation matrix.
2467 :param PageObject page2: the page to be merged into this one. Should be
2468 an instance of :class:`PageObject<PageObject>`.
2469 :param float tx: The translation on X axis
2470 :param float ty: The translation on Y axis
2471 :param float rotation: The angle of the rotation, in degrees
2472 :param float scale: The scaling factor
2473 :param bool expand: Whether the page should be expanded to fit the
2474 dimensions of the page to be merged.
2476 translation = [[1, 0, 0],
2479 rotation = math.radians(rotation)
2480 rotating = [[math.cos(rotation), math.sin(rotation), 0],
2481 [-math.sin(rotation), math.cos(rotation), 0],
2483 scaling = [[scale, 0, 0],
2490 ctm[1][0], ctm[1][1],
2491 ctm[2][0], ctm[2][1]], expand)
2500 Applies a transformation matrix to the page.
2502 :param tuple ctm: A 6-element tuple containing the operands of the
2503 transformation matrix.
2506 if originalContent
is not None:
2507 newContent = PageObject._addTransformationMatrix(
2508 originalContent, self.
pdf, ctm)
2509 newContent = PageObject._pushPopGS(newContent, self.
pdf)
2514 Scales a page by the given factors by applying a transformation
2515 matrix to its content and updating the page size.
2517 :param float sx: The scaling factor on horizontal axis.
2518 :param float sy: The scaling factor on vertical axis.
2524 float(self.
mediaBox.getLowerLeft_x()) * sx,
2525 float(self.
mediaBox.getLowerLeft_y()) * sy,
2526 float(self.
mediaBox.getUpperRight_x()) * sx,
2527 float(self.
mediaBox.getUpperRight_y()) * sy])
2529 viewport = self[
"/VP"]
2530 if isinstance(viewport, ArrayObject):
2531 bbox = viewport[0][
"/BBox"]
2533 bbox = viewport[
"/BBox"]
2535 float(bbox[0]) * sx,
2536 float(bbox[1]) * sy,
2537 float(bbox[2]) * sx,
2538 float(bbox[3]) * sy])
2539 if isinstance(viewport, ArrayObject):
2546 Scales a page by the given factor by applying a transformation
2547 matrix to its content and updating the page size.
2549 :param float factor: The scaling factor (for both X and Y axis).
2551 self.
scale(factor, factor)
2555 Scales a page to the specified dimensions by applying a
2556 transformation matrix to its content and updating the page size.
2558 :param float width: The new width.
2559 :param float height: The new height.
2561 sx = width / float(self.
mediaBox.getUpperRight_x() -
2563 sy = height / float(self.
mediaBox.getUpperRight_y() -
2569 Compresses the size of this page by joining all content streams and
2570 applying a FlateDecode filter.
2572 However, it is possible that this function will perform no action if
2573 content stream compression becomes "automatic" for some reason.
2576 if content
is not None:
2577 if not isinstance(content, ContentStream):
2579 self[
NameObject(
"/Contents")] = content.flateEncode()
2583 Locate all text drawing commands, in the order they are provided in the
2584 content stream, and extract the text. This works well for some PDF
2585 files, but poorly for others, depending on the generator used. This will
2586 be refined in the future. Do not rely on the order of text coming out of
2587 this function, as it will change if this function is made more
2590 :return: a unicode string object.
2594 if not isinstance(content, ContentStream):
2599 for operands, operator
in content.operations:
2600 if operator ==
b_(
"Tj"):
2602 if isinstance(_text, TextStringObject):
2604 elif operator ==
b_(
"T*"):
2606 elif operator ==
b_(
"'"):
2609 if isinstance(_text, TextStringObject):
2611 elif operator ==
b_(
'"'):
2613 if isinstance(_text, TextStringObject):
2616 elif operator ==
b_(
"TJ"):
2617 for i
in operands[0]:
2618 if isinstance(i, TextStringObject):
2625 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2626 defining the boundaries of the physical medium on which the page is
2627 intended to be displayed or printed.
2632 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2633 defining the visible region of default user space. When the page is
2634 displayed or printed, its contents are to be clipped (cropped) to this
2635 rectangle and then imposed on the output medium in some
2636 implementation-defined manner. Default value: same as :attr:`mediaBox<mediaBox>`.
2641 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2642 defining the region to which the contents of the page should be clipped
2643 when output in a production environment.
2648 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2649 defining the intended dimensions of the finished page after trimming.
2654 A :class:`RectangleObject<PyPDF2.generic.RectangleObject>`, expressed in default user space units,
2655 defining the extent of the page's meaningful content as intended by the
2666 stream = stream.getObject()
2667 if isinstance(stream, ArrayObject):
2670 data += s.getObject().
getData()
2676 def __parseContentStream(self, stream):
2682 if peek ==
b_(
'')
or ord_(peek) == 0:
2685 if peek.isalpha()
or peek ==
b_(
"'")
or peek ==
b_(
'"'):
2687 NameObject.delimiterPattern,
True)
2688 if operator ==
b_(
"BI"):
2691 assert operands == []
2697 elif peek ==
b_(
'%'):
2703 while peek
not in (
b_(
'\r'),
b_(
'\n')):
2704 peek = stream.read(1)
2708 def _readInlineImage(self, stream):
2722 settings[key] = value
2724 tmp = stream.read(3)
2725 assert tmp[:2] ==
b_(
"ID")
2729 tok = stream.read(1)
2732 tok2 = stream.read(1)
2735 tok3 = stream.read(1)
2738 has_q_whitespace =
False
2739 while tok3
in utils.WHITESPACES:
2740 has_q_whitespace =
True
2742 tok3 = stream.read(1)
2743 if tok3 ==
b_(
"Q")
and has_q_whitespace:
2754 return {
"settings": settings,
"data": data}
2759 if operator ==
b_(
"INLINE IMAGE"):
2760 newdata.write(
b_(
"BI"))
2763 newdata.write(dicttext.getvalue()[2:-2])
2764 newdata.write(
b_(
"ID "))
2765 newdata.write(operands[
"data"])
2766 newdata.write(
b_(
"EI"))
2769 op.writeToStream(newdata,
None)
2770 newdata.write(
b_(
" "))
2771 newdata.write(
b_(operator))
2772 newdata.write(
b_(
"\n"))
2773 return newdata.getvalue()
2775 def _setData(self, value):
2778 _data = property(_getData, _setData)
2783 A class representing the basic document metadata provided in a PDF File.
2784 This class is accessible through
2785 :meth:`getDocumentInfo()<PyPDF2.PdfFileReader.getDocumentInfo()>`
2787 All text properties of the document metadata have
2788 *two* properties, eg. author and author_raw. The non-raw property will
2789 always return a ``TextStringObject``, making it ideal for a case where
2790 the metadata is being displayed. The raw property can sometimes return
2791 a ``ByteStringObject``, if PyPDF2 was unable to decode the string's
2792 text encoding; this requires additional safety in the caller and
2793 therefore is not as commonly accessed.
2797 DictionaryObject.__init__(self)
2800 retval = self.get(key,
None)
2801 if isinstance(retval, TextStringObject):
2805 title = property(
lambda self: self.
getText(
"/Title"))
2806 """Read-only property accessing the document's **title**.
2807 Returns a unicode string (``TextStringObject``) or ``None``
2808 if the title is not specified."""
2809 title_raw = property(
lambda self: self.get(
"/Title"))
2810 """The "raw" version of title; can return a ``ByteStringObject``."""
2812 author = property(
lambda self: self.
getText(
"/Author"))
2813 """Read-only property accessing the document's **author**.
2814 Returns a unicode string (``TextStringObject``) or ``None``
2815 if the author is not specified."""
2816 author_raw = property(
lambda self: self.get(
"/Author"))
2817 """The "raw" version of author; can return a ``ByteStringObject``."""
2819 subject = property(
lambda self: self.
getText(
"/Subject"))
2820 """Read-only property accessing the document's **subject**.
2821 Returns a unicode string (``TextStringObject``) or ``None``
2822 if the subject is not specified."""
2823 subject_raw = property(
lambda self: self.get(
"/Subject"))
2824 """The "raw" version of subject; can return a ``ByteStringObject``."""
2826 creator = property(
lambda self: self.
getText(
"/Creator"))
2827 """Read-only property accessing the document's **creator**. If the
2828 document was converted to PDF from another format, this is the name of the
2829 application (e.g. OpenOffice) that created the original document from
2830 which it was converted. Returns a unicode string (``TextStringObject``)
2831 or ``None`` if the creator is not specified."""
2832 creator_raw = property(
lambda self: self.get(
"/Creator"))
2833 """The "raw" version of creator; can return a ``ByteStringObject``."""
2835 producer = property(
lambda self: self.
getText(
"/Producer"))
2836 """Read-only property accessing the document's **producer**.
2837 If the document was converted to PDF from another format, this is
2838 the name of the application (for example, OSX Quartz) that converted
2839 it to PDF. Returns a unicode string (``TextStringObject``)
2840 or ``None`` if the producer is not specified."""
2841 producer_raw = property(
lambda self: self.get(
"/Producer"))
2842 """The "raw" version of producer; can return a ``ByteStringObject``."""
2848 d =
b_(
"\x00\x00\x00\x00\x00\x00\x00\x00") +
b_(d)
2850 return struct.unpack(
">q", d)[0]
# Padding constant used by the password algorithms below. The three literals
# hold 12 + 14 + 6 = 32 escaped characters in total.
# _alg32, _alg33 and _alg33_1 append it to a password and keep the first 32
# characters; _alg35 feeds it directly into its hash.
# NOTE(review): presumably the fixed padding string defined for the standard
# security handler in the PDF Reference -- confirm against the spec.
2853 _encryption_padding =
b_(
'\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56') + \
2854 b_(
'\xff\xfa\x01\x08\x2e\x2e\x00\xb6\xd0\x68\x3e\x80\x2f\x0c') + \
2855 b_(
'\xa9\xfe\x64\x53\x69\x7a')
2860 def _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt=True):
2866 password =
b_((
str_(password) +
str_(_encryption_padding))[:32])
2873 m.update(owner_entry.original_bytes)
2876 p_entry = struct.pack(
'<i', p_entry)
2880 m.update(id1_entry.original_bytes)
2883 if rev >= 3
and not metadata_encrypt:
2884 m.update(
b_(
"\xff\xff\xff\xff"))
2886 md5_hash = m.digest()
2894 md5_hash = md5(md5_hash[:keylen]).digest()
2899 return md5_hash[:keylen]
# Visible part of _alg33: derives a key from the owner password through
# _alg33_1, pads/truncates the user password to 32 characters, then runs
# rounds that XOR every key byte with the round counter. What each round does
# with the resulting key, and the return value, are on lines not shown here.
2904 def _alg33(owner_pwd, user_pwd, rev, keylen):
2906 key = _alg33_1(owner_pwd, rev, keylen)
# Append the padding constant and keep only the first 32 characters.
# NOTE(review): _alg32 wraps its password in str_() before concatenating;
# here user_pwd is concatenated as-is -- confirm callers always pass text.
2909 user_pwd =
b_((user_pwd +
str_(_encryption_padding))[:32])
# Round counter runs from 1 to 19 inclusive.
2920 for i
in range(1, 20):
2922 for l
in range(len(key)):
# NOTE(review): appends chr(...) directly, whereas _alg35 wraps the same
# expression in b_(). new_key's initial value is not visible here, so
# confirm the two functions build the same type.
2923 new_key += chr(
ord_(key[l]) ^ i)
2931 def _alg33_1(password, rev, keylen):
2935 password =
b_((password +
str_(_encryption_padding))[:32])
2941 md5_hash = m.digest()
2944 md5_hash = md5(md5_hash).digest()
2949 key = md5_hash[:keylen]
2955 def _alg34(password, owner_entry, p_entry, id1_entry):
2958 key = _alg32(password, 2, 5, owner_entry, p_entry, id1_entry)
# Visible part of _alg35: obtains a key from _alg32, hashes the padding
# constant followed by the first document ID's original bytes, runs rounds
# 1..19 that XOR every key byte with the round counter, and returns a 2-tuple
# (val followed by 16 zero bytes, key). Several intermediate lines (creation
# of `m`, `val` and `new_key`, and what each round does) are not shown here.
2970 def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encrypt):
# NOTE(review): metadata_encrypt is accepted by this function but is not
# forwarded in this call, so _alg32 falls back to its own default
# (metadata_encrypt=True) -- confirm whether it should be passed through.
2973 key = _alg32(password, rev, keylen, owner_entry, p_entry, id1_entry)
# `m` is created on a line not shown -- presumably an md5 object; confirm.
2977 m.update(_encryption_padding)
2982 m.update(id1_entry.original_bytes)
2983 md5_hash = m.digest()
# Round counter runs from 1 to 19 inclusive.
2993 for i
in range(1, 20):
2995 for l
in range(len(key)):
# Per-round key: each byte of `key` XOR-ed with the round counter.
2996 new_key +=
b_(chr(
ord_(key[l]) ^ i))
# The tuple is (val + 16 zero bytes, key); the comma binds last.
3004 return val + (
b_(
'\x00') * 16), key