4 from .generic
import PdfObject
5 from xml.dom
import getDOMImplementation
6 from xml.dom.minidom
import parseString
9 RDF_NAMESPACE =
"http://www.w3.org/1999/02/22-rdf-syntax-ns#"
10 DC_NAMESPACE =
"http://purl.org/dc/elements/1.1/"
11 XMP_NAMESPACE =
"http://ns.adobe.com/xap/1.0/"
12 PDF_NAMESPACE =
"http://ns.adobe.com/pdf/1.3/"
13 XMPMM_NAMESPACE =
"http://ns.adobe.com/xap/1.0/mm/"
35 PDFX_NAMESPACE =
"http://ns.adobe.com/pdfx/1.3/"
37 iso8601 = re.compile(
"""
46 (:(?P<second>[0-9]{2}(.[0-9]+)?))?
47 (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
56 An object that represents Adobe XMP metadata.
57 Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>`
62 docRoot = parseString(self.
stream.getData())
63 self.
rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE,
"RDF")[0]
70 for desc
in self.
rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE,
"Description"):
71 if desc.getAttributeNS(RDF_NAMESPACE,
"about") == aboutUri:
72 attr = desc.getAttributeNodeNS(namespace, name)
75 for element
in desc.getElementsByTagNameNS(namespace, name):
79 for desc
in self.
rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE,
"Description"):
80 if desc.getAttributeNS(RDF_NAMESPACE,
"about") == aboutUri:
81 for i
in range(desc.attributes.length):
82 attr = desc.attributes.item(i)
83 if attr.namespaceURI == namespace:
85 for child
in desc.childNodes:
86 if child.namespaceURI == namespace:
89 def _getText(self, element):
91 for child
in element.childNodes:
92 if child.nodeType == child.TEXT_NODE:
96 def _converter_string(value):
99 def _converter_date(value):
100 m = iso8601.match(value)
101 year = int(m.group(
"year"))
102 month = int(m.group(
"month")
or "1")
103 day = int(m.group(
"day")
or "1")
104 hour = int(m.group(
"hour")
or "0")
105 minute = int(m.group(
"minute")
or "0")
106 second = decimal.Decimal(m.group(
"second")
or "0")
107 seconds = second.to_integral(decimal.ROUND_FLOOR)
108 milliseconds = (second - seconds) * 1000000
109 tzd = m.group(
"tzd")
or "Z"
110 dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
112 tzd_hours, tzd_minutes = [int(x)
for x
in tzd.split(
":")]
116 dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
118 _test_converter_date = staticmethod(_converter_date)
120 def _getter_bag(namespace, name, converter):
122 cached = self.
cache.get(namespace, {}).get(name)
126 for element
in self.
getElement(
"", namespace, name):
127 bags = element.getElementsByTagNameNS(RDF_NAMESPACE,
"Bag")
130 for item
in bag.getElementsByTagNameNS(RDF_NAMESPACE,
"li"):
132 value = converter(value)
134 ns_cache = self.
cache.setdefault(namespace, {})
135 ns_cache[name] = retval
139 def _getter_seq(namespace, name, converter):
141 cached = self.
cache.get(namespace, {}).get(name)
145 for element
in self.
getElement(
"", namespace, name):
146 seqs = element.getElementsByTagNameNS(RDF_NAMESPACE,
"Seq")
149 for item
in seq.getElementsByTagNameNS(RDF_NAMESPACE,
"li"):
151 value = converter(value)
154 value = converter(self.
_getText(element))
156 ns_cache = self.
cache.setdefault(namespace, {})
157 ns_cache[name] = retval
161 def _getter_langalt(namespace, name, converter):
163 cached = self.
cache.get(namespace, {}).get(name)
167 for element
in self.
getElement(
"", namespace, name):
168 alts = element.getElementsByTagNameNS(RDF_NAMESPACE,
"Alt")
171 for item
in alt.getElementsByTagNameNS(RDF_NAMESPACE,
"li"):
173 value = converter(value)
174 retval[item.getAttribute(
"xml:lang")] = value
176 retval[
"x-default"] = converter(self.
_getText(element))
177 ns_cache = self.
cache.setdefault(namespace, {})
178 ns_cache[name] = retval
182 def _getter_single(namespace, name, converter):
184 cached = self.
cache.get(namespace, {}).get(name)
188 for element
in self.
getElement(
"", namespace, name):
189 if element.nodeType == element.ATTRIBUTE_NODE:
190 value = element.nodeValue
195 value = converter(value)
196 ns_cache = self.
cache.setdefault(namespace, {})
197 ns_cache[name] = value
201 dc_contributor = property(_getter_bag(DC_NAMESPACE,
"contributor", _converter_string))
203 Contributors to the resource (other than the authors). An unsorted
207 dc_coverage = property(_getter_single(DC_NAMESPACE,
"coverage", _converter_string))
209 Text describing the extent or scope of the resource.
212 dc_creator = property(_getter_seq(DC_NAMESPACE,
"creator", _converter_string))
214 A sorted array of names of the authors of the resource, listed in order
218 dc_date = property(_getter_seq(DC_NAMESPACE,
"date", _converter_date))
220 A sorted array of dates (datetime.datetime instances) of significance to
221 the resource. The dates and times are in UTC.
224 dc_description = property(_getter_langalt(DC_NAMESPACE,
"description", _converter_string))
226 A language-keyed dictionary of textual descriptions of the content of the
230 dc_format = property(_getter_single(DC_NAMESPACE,
"format", _converter_string))
232 The mime-type of the resource.
235 dc_identifier = property(_getter_single(DC_NAMESPACE,
"identifier", _converter_string))
237 Unique identifier of the resource.
240 dc_language = property(_getter_bag(DC_NAMESPACE,
"language", _converter_string))
242 An unordered array specifying the languages used in the resource.
245 dc_publisher = property(_getter_bag(DC_NAMESPACE,
"publisher", _converter_string))
247 An unordered array of publisher names.
250 dc_relation = property(_getter_bag(DC_NAMESPACE,
"relation", _converter_string))
252 An unordered array of text descriptions of relationships to other
256 dc_rights = property(_getter_langalt(DC_NAMESPACE,
"rights", _converter_string))
258 A language-keyed dictionary of textual descriptions of the rights the
259 user has to this resource.
262 dc_source = property(_getter_single(DC_NAMESPACE,
"source", _converter_string))
264 Unique identifier of the work from which this resource was derived.
267 dc_subject = property(_getter_bag(DC_NAMESPACE,
"subject", _converter_string))
269 An unordered array of descriptive phrases or keywords that specify the
270 topic of the content of the resource.
273 dc_title = property(_getter_langalt(DC_NAMESPACE,
"title", _converter_string))
275 A language-keyed dictionary of the title of the resource.
278 dc_type = property(_getter_bag(DC_NAMESPACE,
"type", _converter_string))
280 An unordered array of textual descriptions of the document type.
283 pdf_keywords = property(_getter_single(PDF_NAMESPACE,
"Keywords", _converter_string))
285 An unformatted text string representing document keywords.
288 pdf_pdfversion = property(_getter_single(PDF_NAMESPACE,
"PDFVersion", _converter_string))
290 The PDF file version, for example 1.0, 1.3.
293 pdf_producer = property(_getter_single(PDF_NAMESPACE,
"Producer", _converter_string))
295 The name of the tool that created the PDF document.
298 xmp_createDate = property(_getter_single(XMP_NAMESPACE,
"CreateDate", _converter_date))
300 The date and time the resource was originally created. The date and
301 time are returned as a UTC datetime.datetime object.
304 xmp_modifyDate = property(_getter_single(XMP_NAMESPACE,
"ModifyDate", _converter_date))
306 The date and time the resource was last modified. The date and time
307 are returned as a UTC datetime.datetime object.
310 xmp_metadataDate = property(_getter_single(XMP_NAMESPACE,
"MetadataDate", _converter_date))
312 The date and time that any metadata for this resource was last
313 changed. The date and time are returned as a UTC datetime.datetime
317 xmp_creatorTool = property(_getter_single(XMP_NAMESPACE,
"CreatorTool", _converter_string))
319 The name of the first known tool used to create the resource.
322 xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE,
"DocumentID", _converter_string))
324 The common identifier for all versions and renditions of this resource.
327 xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE,
"InstanceID", _converter_string))
329 An identifier for a specific incarnation of a document, updated each
330 time a file is saved.
334 if not hasattr(self,
"_custom_properties"):
340 idx = key.find(
u_(
"\u2182"))
343 key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
344 if node.nodeType == node.ATTRIBUTE_NODE:
345 value = node.nodeValue
351 custom_properties = property(custom_properties)
353 Retrieves custom metadata properties defined in the undocumented pdfx
356 :return: a dictionary of key/value items for custom metadata properties.