32 Implementation of generic PDF objects (dictionary, number, string, and so on)
34 __author__ =
"Mathieu Fenniak"
35 __author_email__ =
"biziqe@mathieu.fenniak.net"
38 from .utils
import readNonWhitespace, RC4_encrypt, skipOverComment
39 from .utils
import b_, u_, chr_, ord_
40 from .utils
import PdfStreamError
49 ObjectPrefix =
b_(
'/<[tf(n%')
50 NumberSigns =
b_(
'+-')
51 IndirectPattern = re.compile(
b_(
r"(\d+)\s+(\d+)\s+R[^a-zA-Z]"))
57 idx = ObjectPrefix.find(tok)
60 return NameObject.readFromStream(stream, pdf)
66 return DictionaryObject.readFromStream(stream, pdf)
71 return ArrayObject.readFromStream(stream, pdf)
72 elif idx == 3
or idx == 4:
74 return BooleanObject.readFromStream(stream)
80 return NullObject.readFromStream(stream)
83 while tok
not in (
b_(
'\r'),
b_(
'\n')):
90 if tok
in NumberSigns:
92 return NumberObject.readFromStream(stream)
93 peek = stream.read(20)
94 stream.seek(-len(peek), 1)
95 if IndirectPattern.match(peek) !=
None:
96 return IndirectObject.readFromStream(stream, pdf)
98 return NumberObject.readFromStream(stream)
103 """Resolves indirect references."""
109 stream.write(
b_(
"null"))
112 nulltxt = stream.read(4)
113 if nulltxt !=
b_(
"null"):
116 readFromStream = staticmethod(readFromStream)
125 stream.write(
b_(
"true"))
127 stream.write(
b_(
"false"))
130 word = stream.read(4)
131 if word ==
b_(
"true"):
133 elif word ==
b_(
"fals"):
138 readFromStream = staticmethod(readFromStream)
143 stream.write(
b_(
"["))
145 stream.write(
b_(
" "))
146 data.writeToStream(stream, encryption_key)
147 stream.write(
b_(
" ]"))
161 peekahead = stream.read(1)
162 if peekahead ==
b_(
"]"):
168 readFromStream = staticmethod(readFromStream)
186 isinstance(other, IndirectObject)
and
187 self.
idnum == other.idnum
and
189 self.
pdf is other.pdf
193 return not self.
__eq__(other)
223 readFromStream = staticmethod(readFromStream)
229 return decimal.Decimal.__new__(cls,
utils.str_(value), context)
231 return decimal.Decimal.__new__(cls, str(value))
234 if self == self.to_integral():
235 return str(self.quantize(decimal.Decimal(1)))
240 while o
and o[-1] ==
'0':
245 return float(
b_(repr(self)))
248 stream.write(
b_(repr(self)))
252 NumberPattern = re.compile(
b_(
'[^+-.0-9]'))
258 return int.__new__(cls, val)
259 except OverflowError:
260 return int.__new__(cls, 0)
263 return int(
b_(repr(self)))
266 stream.write(
b_(repr(self)))
270 if num.find(NumberObject.ByteDot) != -1:
274 readFromStream = staticmethod(readFromStream)
281 if isinstance(string, utils.string_type):
283 elif isinstance(string, utils.bytes_type):
285 if string.startswith(codecs.BOM_UTF16_BE):
287 retval.autodetect_utf16 =
True
295 retval.autodetect_pdfdocencoding =
True
297 except UnicodeDecodeError:
300 raise TypeError(
"createStringObject should have str or unicode arg")
316 txt += chr(int(x, base=16))
321 txt += chr(int(x, base=16))
340 elif tok ==
b_(
"\\"):
360 elif tok ==
b_(
"\\"):
362 elif tok
in (
b_(
" "),
b_(
"/"),
b_(
"%"),
b_(
"<"),
b_(
">"),
b_(
"["),
373 ntok = stream.read(1)
378 tok =
b_(chr(int(tok, base=8)))
379 elif tok
in b_(
"\n\r"):
384 if not tok
in b_(
"\n\r"):
405 original_bytes = property(
lambda self: self)
411 stream.write(
b_(
"<"))
413 stream.write(
b_(
">"))
422 autodetect_pdfdocencoding =
False
423 autodetect_utf16 =
False
439 return codecs.BOM_UTF16_BE + self.encode(
"utf-16be")
443 raise Exception(
"no information about original bytes")
451 except UnicodeEncodeError:
452 bytearr = codecs.BOM_UTF16_BE + self.encode(
"utf-16be")
456 obj.writeToStream(stream,
None)
458 stream.write(
b_(
"("))
460 if not chr_(c).isalnum()
and c !=
b_(
' '):
461 stream.write(
b_(
"\\%03o" %
ord_(c)))
464 stream.write(
b_(
")"))
468 delimiterPattern = re.compile(
b_(
r"\s+|[\(\)<>\[\]{}/%]"))
472 stream.write(
b_(self))
476 if debug: print((stream.tell()))
477 name = stream.read(1)
478 if name != NameObject.surfix:
482 if debug: print(name)
485 except (UnicodeEncodeError, UnicodeDecodeError)
as e:
494 readFromStream = staticmethod(readFromStream)
499 return dict.__getitem__(self, key)
502 if not isinstance(key, PdfObject):
503 raise ValueError(
"key must be PdfObject")
504 if not isinstance(value, PdfObject):
505 raise ValueError(
"value must be PdfObject")
506 return dict.__setitem__(self, key, value)
509 if not isinstance(key, PdfObject):
510 raise ValueError(
"key must be PdfObject")
511 if not isinstance(value, PdfObject):
512 raise ValueError(
"value must be PdfObject")
513 return dict.setdefault(self, key, value)
516 return dict.__getitem__(self, key).
getObject()
527 metadata = self.get(
"/Metadata",
None)
530 metadata = metadata.getObject()
545 stream.write(
b_(
"<<\n"))
546 for key, value
in list(self.items()):
547 key.writeToStream(stream, encryption_key)
548 stream.write(
b_(
" "))
549 value.writeToStream(stream, encryption_key)
550 stream.write(
b_(
"\n"))
551 stream.write(
b_(
">>"))
561 if tok ==
b_(
'\x00'):
571 if debug: print((
"Tok:", tok))
580 if not data.get(key):
587 warnings.warn(
"Multiple definitions in dictionary at byte %s for key %s" \
592 if s ==
b_(
's')
and stream.read(5) ==
b_(
'tream'):
596 while eol ==
b_(
' '):
598 assert eol
in (
b_(
"\n"),
b_(
"\r"))
601 if stream.read(1) !=
b_(
'\n'):
604 assert "/Length" in data
605 length = data[
"/Length"]
606 if debug: print(data)
607 if isinstance(length, IndirectObject):
609 length = pdf.getObject(length)
611 data[
"__streamdata__"] = stream.read(length)
612 if debug: print(
"here")
615 ndstream = stream.read(8)
616 if (e + ndstream) !=
b_(
"endstream"):
626 if end ==
b_(
"endstream"):
628 data[
"__streamdata__"] = data[
"__streamdata__"][:-1]
630 if debug: print((
"E", e, ndstream, debugging.toHex(end)))
635 if "__streamdata__" in data:
636 return StreamObject.initializeFromDictionary(data)
641 readFromStream = staticmethod(readFromStream)
646 DictionaryObject.__init__(self)
649 return '/First' in self
658 child = self[
'/First']
661 if child == self[
'/Last']:
663 child = child[
'/Next']
666 childObj = child.getObject()
667 child = pdf.getReference(childObj)
668 assert isinstance(child, IndirectObject)
670 if '/First' not in self:
681 prevRef = pdf.getReference(prev)
682 assert isinstance(prevRef, IndirectObject)
686 parentRef = pdf.getReference(self)
687 assert isinstance(parentRef, IndirectObject)
691 childObj = child.getObject()
694 raise ValueError(
"Removed child does not appear to be a tree item")
696 raise ValueError(
"Removed child is not a member of this tree")
702 cur = curRef.getObject()
704 last = lastRef.getObject()
711 next = nextRef.getObject()
727 next = nextRef.getObject()
744 cur = curRef.getObject()
750 raise ValueError(
"Removal couldn't find item in tree")
760 childObj = child.getObject()
782 DictionaryObject.writeToStream(self, stream, encryption_key)
784 stream.write(
b_(
"\nstream\n"))
789 stream.write(
b_(
"\nendstream"))
792 if "/Filter" in data:
796 retval._data = data[
"__streamdata__"]
797 del data[
"__streamdata__"]
801 initializeFromDictionary = staticmethod(initializeFromDictionary)
804 if "/Filter" in self:
806 if isinstance(f, ArrayObject):
842 for key, value
in list(self.items()):
843 if not key
in (
"/Length",
"/Filter",
"/DecodeParms"):
854 This class is used to represent *page boxes* in PyPDF2. These boxes include:
856 * :attr:`artBox <PyPDF2.pdf.PageObject.artBox>`
857 * :attr:`bleedBox <PyPDF2.pdf.PageObject.bleedBox>`
858 * :attr:`cropBox <PyPDF2.pdf.PageObject.cropBox>`
859 * :attr:`mediaBox <PyPDF2.pdf.PageObject.mediaBox>`
860 * :attr:`trimBox <PyPDF2.pdf.PageObject.trimBox>`
869 if not isinstance(value, (NumberObject, FloatObject)):
874 return "RectangleObject(%s)" % repr(list(self))
930 lowerLeft = property(getLowerLeft, setLowerLeft,
None,
None)
932 Property to read and modify the lower left coordinate of this box
935 lowerRight = property(getLowerRight, setLowerRight,
None,
None)
937 Property to read and modify the lower right coordinate of this box
940 upperLeft = property(getUpperLeft, setUpperLeft,
None,
None)
942 Property to read and modify the upper left coordinate of this box
945 upperRight = property(getUpperRight, setUpperRight,
None,
None)
947 Property to read and modify the upper right coordinate of this box
954 A class representing a field dictionary. This class is accessed through
955 :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
958 DictionaryObject.__init__(self)
959 attributes = (
"/FT",
"/Parent",
"/Kids",
"/T",
"/TU",
"/TM",
"/Ff",
961 for attr
in attributes:
967 fieldType = property(
lambda self: self.get(
"/FT"))
969 Read-only property accessing the type of this field.
972 parent = property(
lambda self: self.get(
"/Parent"))
974 Read-only property accessing the parent of this field.
977 kids = property(
lambda self: self.get(
"/Kids"))
979 Read-only property accessing the kids of this field.
982 name = property(
lambda self: self.get(
"/T"))
984 Read-only property accessing the name of this field.
987 altName = property(
lambda self: self.get(
"/TU"))
989 Read-only property accessing the alternate name of this field.
992 mappingName = property(
lambda self: self.get(
"/TM"))
994 Read-only property accessing the mapping name of this field. This
995 name is used by PyPDF2 as a key in the dictionary returned by
996 :meth:`getFields()<PyPDF2.PdfFileReader.getFields>`
999 flags = property(
lambda self: self.get(
"/Ff"))
1001 Read-only property accessing the field flags, specifying various
1002 characteristics of the field (see Table 8.70 of the PDF 1.7 reference).
1005 value = property(
lambda self: self.get(
"/V"))
1007 Read-only property accessing the value of this field. Format
1008 varies based on field type.
1011 defaultValue = property(
lambda self: self.get(
"/DV"))
1013 Read-only property accessing the default value of this field.
1016 additionalActions = property(
lambda self: self.get(
"/AA"))
1018 Read-only property accessing the additional actions dictionary.
1019 This dictionary defines the field's behavior in response to trigger events.
1020 See Section 8.5.2 of the PDF 1.7 reference.
1026 A class representing a destination within a PDF file.
1027 See section 8.2.1 of the PDF 1.6 reference.
1029 :param str title: Title of this destination.
1030 :param int page: Page number of this destination.
1031 :param str typ: How the destination is displayed.
1032 :param args: Additional arguments may be necessary depending on the type.
1033 :raises PdfReadError: If destination type is invalid.
1035 Valid ``typ`` arguments (see PDF spec for details):
1036 /Fit No additional arguments
1037 /XYZ [left] [top] [zoomFactor]
1040 /FitR [left] [bottom] [right] [top]
1041 /FitB No additional arguments
1046 DictionaryObject.__init__(self)
1055 elif typ ==
"/FitR":
1058 elif typ
in [
"/FitH",
"/FitBH"]:
1060 elif typ
in [
"/FitV",
"/FitBV"]:
1062 elif typ
in [
"/Fit",
"/FitB"]:
1068 return ArrayObject([self.
raw_get(
'/Page'), self[
'/Type']] + [self[x]
for x
in [
'/Left',
'/Bottom',
'/Right',
'/Top',
'/Zoom']
if x
in self])
1071 stream.write(
b_(
"<<\n"))
1073 key.writeToStream(stream, encryption_key)
1074 stream.write(
b_(
" "))
1076 value.writeToStream(stream, encryption_key)
1079 key.writeToStream(stream, encryption_key)
1080 stream.write(
b_(
" "))
1082 value.writeToStream(stream, encryption_key)
1084 stream.write(
b_(
"\n"))
1085 stream.write(
b_(
">>"))
1087 title = property(
lambda self: self.get(
"/Title"))
1089 Read-only property accessing the destination title.
1094 page = property(
lambda self: self.get(
"/Page"))
1096 Read-only property accessing the destination page number.
1101 typ = property(
lambda self: self.get(
"/Type"))
1103 Read-only property accessing the destination type.
1108 zoom = property(
lambda self: self.get(
"/Zoom",
None))
1110 Read-only property accessing the zoom factor.
1112 :rtype: int, or ``None`` if not available.
1115 left = property(
lambda self: self.get(
"/Left",
None))
1117 Read-only property accessing the left horizontal coordinate.
1119 :rtype: int, or ``None`` if not available.
1122 right = property(
lambda self: self.get(
"/Right",
None))
1124 Read-only property accessing the right horizontal coordinate.
1126 :rtype: int, or ``None`` if not available.
1129 top = property(
lambda self: self.get(
"/Top",
None))
1131 Read-only property accessing the top vertical coordinate.
1133 :rtype: int, or ``None`` if not available.
1136 bottom = property(
lambda self: self.get(
"/Bottom",
None))
1138 Read-only property accessing the bottom vertical coordinate.
1140 :rtype: int, or ``None`` if not available.
1146 stream.write(
b_(
"<<\n"))
1147 for key
in [
NameObject(x)
for x
in [
'/Title',
'/Parent',
'/First',
'/Last',
'/Next',
'/Prev']
if x
in self]:
1148 key.writeToStream(stream, encryption_key)
1149 stream.write(
b_(
" "))
1151 value.writeToStream(stream, encryption_key)
1152 stream.write(
b_(
"\n"))
1154 key.writeToStream(stream, encryption_key)
1155 stream.write(
b_(
" "))
1157 value.writeToStream(stream, encryption_key)
1158 stream.write(
b_(
"\n"))
1159 stream.write(
b_(
">>"))
1164 for c
in unicode_string:
1166 retval +=
b_(chr(_pdfDocEncoding_rev[c]))
1168 raise UnicodeEncodeError(
"pdfdocencoding", c, -1, -1,
1169 "does not exist in translation table")
1175 for b
in byte_array:
1176 c = _pdfDocEncoding[
ord_(b)]
1177 if c ==
u_(
'\u0000'):
1178 raise UnicodeDecodeError(
"pdfdocencoding",
utils.barray(b), -1, -1,
1179 "does not exist in translation table")
1184 u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
1185 u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
1186 u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
u_(
'\u0000'),
1187 u_(
'\u02d8'),
u_(
'\u02c7'),
u_(
'\u02c6'),
u_(
'\u02d9'),
u_(
'\u02dd'),
u_(
'\u02db'),
u_(
'\u02da'),
u_(
'\u02dc'),
1188 u_(
'\u0020'),
u_(
'\u0021'),
u_(
'\u0022'),
u_(
'\u0023'),
u_(
'\u0024'),
u_(
'\u0025'),
u_(
'\u0026'),
u_(
'\u0027'),
1189 u_(
'\u0028'),
u_(
'\u0029'),
u_(
'\u002a'),
u_(
'\u002b'),
u_(
'\u002c'),
u_(
'\u002d'),
u_(
'\u002e'),
u_(
'\u002f'),
1190 u_(
'\u0030'),
u_(
'\u0031'),
u_(
'\u0032'),
u_(
'\u0033'),
u_(
'\u0034'),
u_(
'\u0035'),
u_(
'\u0036'),
u_(
'\u0037'),
1191 u_(
'\u0038'),
u_(
'\u0039'),
u_(
'\u003a'),
u_(
'\u003b'),
u_(
'\u003c'),
u_(
'\u003d'),
u_(
'\u003e'),
u_(
'\u003f'),
1192 u_(
'\u0040'),
u_(
'\u0041'),
u_(
'\u0042'),
u_(
'\u0043'),
u_(
'\u0044'),
u_(
'\u0045'),
u_(
'\u0046'),
u_(
'\u0047'),
1193 u_(
'\u0048'),
u_(
'\u0049'),
u_(
'\u004a'),
u_(
'\u004b'),
u_(
'\u004c'),
u_(
'\u004d'),
u_(
'\u004e'),
u_(
'\u004f'),
1194 u_(
'\u0050'),
u_(
'\u0051'),
u_(
'\u0052'),
u_(
'\u0053'),
u_(
'\u0054'),
u_(
'\u0055'),
u_(
'\u0056'),
u_(
'\u0057'),
1195 u_(
'\u0058'),
u_(
'\u0059'),
u_(
'\u005a'),
u_(
'\u005b'),
u_(
'\u005c'),
u_(
'\u005d'),
u_(
'\u005e'),
u_(
'\u005f'),
1196 u_(
'\u0060'),
u_(
'\u0061'),
u_(
'\u0062'),
u_(
'\u0063'),
u_(
'\u0064'),
u_(
'\u0065'),
u_(
'\u0066'),
u_(
'\u0067'),
1197 u_(
'\u0068'),
u_(
'\u0069'),
u_(
'\u006a'),
u_(
'\u006b'),
u_(
'\u006c'),
u_(
'\u006d'),
u_(
'\u006e'),
u_(
'\u006f'),
1198 u_(
'\u0070'),
u_(
'\u0071'),
u_(
'\u0072'),
u_(
'\u0073'),
u_(
'\u0074'),
u_(
'\u0075'),
u_(
'\u0076'),
u_(
'\u0077'),
1199 u_(
'\u0078'),
u_(
'\u0079'),
u_(
'\u007a'),
u_(
'\u007b'),
u_(
'\u007c'),
u_(
'\u007d'),
u_(
'\u007e'),
u_(
'\u0000'),
1200 u_(
'\u2022'),
u_(
'\u2020'),
u_(
'\u2021'),
u_(
'\u2026'),
u_(
'\u2014'),
u_(
'\u2013'),
u_(
'\u0192'),
u_(
'\u2044'),
1201 u_(
'\u2039'),
u_(
'\u203a'),
u_(
'\u2212'),
u_(
'\u2030'),
u_(
'\u201e'),
u_(
'\u201c'),
u_(
'\u201d'),
u_(
'\u2018'),
1202 u_(
'\u2019'),
u_(
'\u201a'),
u_(
'\u2122'),
u_(
'\ufb01'),
u_(
'\ufb02'),
u_(
'\u0141'),
u_(
'\u0152'),
u_(
'\u0160'),
1203 u_(
'\u0178'),
u_(
'\u017d'),
u_(
'\u0131'),
u_(
'\u0142'),
u_(
'\u0153'),
u_(
'\u0161'),
u_(
'\u017e'),
u_(
'\u0000'),
1204 u_(
'\u20ac'),
u_(
'\u00a1'),
u_(
'\u00a2'),
u_(
'\u00a3'),
u_(
'\u00a4'),
u_(
'\u00a5'),
u_(
'\u00a6'),
u_(
'\u00a7'),
1205 u_(
'\u00a8'),
u_(
'\u00a9'),
u_(
'\u00aa'),
u_(
'\u00ab'),
u_(
'\u00ac'),
u_(
'\u0000'),
u_(
'\u00ae'),
u_(
'\u00af'),
1206 u_(
'\u00b0'),
u_(
'\u00b1'),
u_(
'\u00b2'),
u_(
'\u00b3'),
u_(
'\u00b4'),
u_(
'\u00b5'),
u_(
'\u00b6'),
u_(
'\u00b7'),
1207 u_(
'\u00b8'),
u_(
'\u00b9'),
u_(
'\u00ba'),
u_(
'\u00bb'),
u_(
'\u00bc'),
u_(
'\u00bd'),
u_(
'\u00be'),
u_(
'\u00bf'),
1208 u_(
'\u00c0'),
u_(
'\u00c1'),
u_(
'\u00c2'),
u_(
'\u00c3'),
u_(
'\u00c4'),
u_(
'\u00c5'),
u_(
'\u00c6'),
u_(
'\u00c7'),
1209 u_(
'\u00c8'),
u_(
'\u00c9'),
u_(
'\u00ca'),
u_(
'\u00cb'),
u_(
'\u00cc'),
u_(
'\u00cd'),
u_(
'\u00ce'),
u_(
'\u00cf'),
1210 u_(
'\u00d0'),
u_(
'\u00d1'),
u_(
'\u00d2'),
u_(
'\u00d3'),
u_(
'\u00d4'),
u_(
'\u00d5'),
u_(
'\u00d6'),
u_(
'\u00d7'),
1211 u_(
'\u00d8'),
u_(
'\u00d9'),
u_(
'\u00da'),
u_(
'\u00db'),
u_(
'\u00dc'),
u_(
'\u00dd'),
u_(
'\u00de'),
u_(
'\u00df'),
1212 u_(
'\u00e0'),
u_(
'\u00e1'),
u_(
'\u00e2'),
u_(
'\u00e3'),
u_(
'\u00e4'),
u_(
'\u00e5'),
u_(
'\u00e6'),
u_(
'\u00e7'),
1213 u_(
'\u00e8'),
u_(
'\u00e9'),
u_(
'\u00ea'),
u_(
'\u00eb'),
u_(
'\u00ec'),
u_(
'\u00ed'),
u_(
'\u00ee'),
u_(
'\u00ef'),
1214 u_(
'\u00f0'),
u_(
'\u00f1'),
u_(
'\u00f2'),
u_(
'\u00f3'),
u_(
'\u00f4'),
u_(
'\u00f5'),
u_(
'\u00f6'),
u_(
'\u00f7'),
1215 u_(
'\u00f8'),
u_(
'\u00f9'),
u_(
'\u00fa'),
u_(
'\u00fb'),
u_(
'\u00fc'),
u_(
'\u00fd'),
u_(
'\u00fe'),
u_(
'\u00ff')
1218 assert len(_pdfDocEncoding) == 256
1220 _pdfDocEncoding_rev = {}
1221 for i
in range(256):
1222 char = _pdfDocEncoding[i]
1223 if char ==
u_(
"\u0000"):
1225 assert char
not in _pdfDocEncoding_rev
1226 _pdfDocEncoding_rev[char] = i