LeenO computo metrico con LibreOffice  3.22.0
Il software libero per la gestione di computi metrici e contabilità lavori.
xmp.py
Vai alla documentazione di questo file.
1 import re
2 import datetime
3 import decimal
4 from .generic import PdfObject
5 from xml.dom import getDOMImplementation
6 from xml.dom.minidom import parseString
7 from .utils import u_
8 
# XML namespace URIs used to look up XMP metadata nodes in the RDF document.
# RDF itself: rdf:RDF root, rdf:Description, and the Bag/Seq/Alt/li containers.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# Namespace queried by the dc_* properties.
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
# Namespace queried by the xmp_* properties.
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
# Namespace queried by the pdf_* properties.
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
# Namespace queried by the xmpmm_* properties.
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask? I might ask that too. It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning. Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value. The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character. A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight. A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
36 
# Pattern for ISO 8601-style date-time strings: a four-digit year, optionally
# followed by "-MM", "-DD" and a time of day "Thh:mm[:ss[.fraction]]" that must
# end in a time-zone designator ("Z" or "+hh:mm" / "-hh:mm").
#
# The pattern is a raw string and the fractional-seconds separator is an
# escaped literal dot; an unescaped "." would accept any character there
# (e.g. "05x678") and hand an unparsable "second" group to the caller.
iso8601 = re.compile(r"""
    (?P<year>[0-9]{4})
    (-
        (?P<month>[0-9]{2})
        (-
            (?P<day>[0-9]+)
            (T
                (?P<hour>[0-9]{2}):
                (?P<minute>[0-9]{2})
                (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
            )?
        )?
    )?
    """, re.VERBOSE)
52 
53 
55  """
56  An object that represents Adobe XMP metadata.
57  Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>`
58  """
59 
def __init__(self, stream):
    """
    Parse the XML returned by ``stream.getData()`` and remember its RDF root.

    :param stream: object providing ``getData()``; it is kept on
        ``self.stream``.
    """
    self.stream = stream
    document = parseString(stream.getData())
    rdf_nodes = document.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
    # Only the first rdf:RDF element is used.
    self.rdfRoot = rdf_nodes[0]
    # Per-namespace memo of already converted property values.
    self.cache = {}
65 
def writeToStream(self, stream, encryption_key):
    """
    Write this object by delegating to the wrapped stream object.

    Both arguments are forwarded unchanged to ``self.stream.writeToStream``.
    """
    self.stream.writeToStream(stream, encryption_key)
68 
def getElement(self, aboutUri, namespace, name):
    """
    Yield the nodes called ``name`` in ``namespace`` for one described resource.

    Every ``rdf:Description`` under the RDF root whose ``rdf:about``
    attribute equals ``aboutUri`` is inspected. For each one, the matching
    attribute node (if present) is yielded first, followed by every matching
    element found below that description.

    :param aboutUri: value the ``rdf:about`` attribute must have.
    :param namespace: namespace URI of the wanted property.
    :param name: local name of the wanted property.
    """
    for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
        if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
            continue
        attr = desc.getAttributeNodeNS(namespace, name)
        # Identity test: getAttributeNodeNS returns None when the attribute
        # is absent.
        if attr is not None:
            yield attr
        for element in desc.getElementsByTagNameNS(namespace, name):
            yield element
77 
def getNodesInNamespace(self, aboutUri, namespace):
    """
    Yield every attribute and direct child node belonging to ``namespace``.

    Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
    ``aboutUri`` are considered. For each of them the attributes are yielded
    first, then the child nodes.
    """
    descriptions = self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
    for description in descriptions:
        if description.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
            continue
        for index in range(description.attributes.length):
            attribute = description.attributes.item(index)
            if attribute.namespaceURI == namespace:
                yield attribute
        for node in description.childNodes:
            if node.namespaceURI == namespace:
                yield node
88 
89  def _getText(self, element):
90  text = ""
91  for child in element.childNodes:
92  if child.nodeType == child.TEXT_NODE:
93  text += child.data
94  return text
95 
def _converter_string(value):
    """Identity converter: return ``value`` unchanged."""
    return value
98 
def _converter_date(value):
    """
    Convert an ISO 8601-style date-time string to a naive ``datetime`` in UTC.

    Missing month/day default to 1 and missing time fields to 0. When the
    string carries a numeric time-zone designator, the offset is removed so
    the result is expressed in UTC.

    :param value: text matched against the module-level ``iso8601`` pattern.
    :return: ``datetime.datetime`` without tzinfo.
    :raises ValueError: if ``value`` does not start with a four-digit year,
        or if the parsed fields do not form a valid date.
    """
    m = iso8601.match(value)
    if m is None:
        raise ValueError("invalid ISO 8601 date: %r" % (value,))
    year = int(m.group("year"))
    month = int(m.group("month") or "1")
    day = int(m.group("day") or "1")
    hour = int(m.group("hour") or "0")
    minute = int(m.group("minute") or "0")
    second = decimal.Decimal(m.group("second") or "0")
    whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
    microseconds = (second - whole_seconds) * 1000000
    # datetime() only accepts real integers, so the Decimal values are
    # converted explicitly (any sub-microsecond part is truncated).
    dt = datetime.datetime(
        year, month, day, hour, minute, int(whole_seconds), int(microseconds)
    )
    tzd = m.group("tzd") or "Z"
    if tzd != "Z":
        # Take the sign from the designator itself: int("+00") and int("-00")
        # are both 0, so the sign cannot be recovered from the hour value.
        sign = -1 if tzd.startswith("-") else 1
        tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
        offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        # local = UTC + offset, hence UTC = local - offset.
        dt = dt - sign * offset
    return dt
_test_converter_date = staticmethod(_converter_date)
119 
120  def _getter_bag(namespace, name, converter):
121  def get(self):
122  cached = self.cache.get(namespace, {}).get(name)
123  if cached:
124  return cached
125  retval = []
126  for element in self.getElement("", namespace, name):
127  bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
128  if len(bags):
129  for bag in bags:
130  for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
131  value = self._getText(item)
132  value = converter(value)
133  retval.append(value)
134  ns_cache = self.cache.setdefault(namespace, {})
135  ns_cache[name] = retval
136  return retval
137  return get
138 
139  def _getter_seq(namespace, name, converter):
140  def get(self):
141  cached = self.cache.get(namespace, {}).get(name)
142  if cached:
143  return cached
144  retval = []
145  for element in self.getElement("", namespace, name):
146  seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
147  if len(seqs):
148  for seq in seqs:
149  for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
150  value = self._getText(item)
151  value = converter(value)
152  retval.append(value)
153  else:
154  value = converter(self._getText(element))
155  retval.append(value)
156  ns_cache = self.cache.setdefault(namespace, {})
157  ns_cache[name] = retval
158  return retval
159  return get
160 
161  def _getter_langalt(namespace, name, converter):
162  def get(self):
163  cached = self.cache.get(namespace, {}).get(name)
164  if cached:
165  return cached
166  retval = {}
167  for element in self.getElement("", namespace, name):
168  alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
169  if len(alts):
170  for alt in alts:
171  for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
172  value = self._getText(item)
173  value = converter(value)
174  retval[item.getAttribute("xml:lang")] = value
175  else:
176  retval["x-default"] = converter(self._getText(element))
177  ns_cache = self.cache.setdefault(namespace, {})
178  ns_cache[name] = retval
179  return retval
180  return get
181 
182  def _getter_single(namespace, name, converter):
183  def get(self):
184  cached = self.cache.get(namespace, {}).get(name)
185  if cached:
186  return cached
187  value = None
188  for element in self.getElement("", namespace, name):
189  if element.nodeType == element.ATTRIBUTE_NODE:
190  value = element.nodeValue
191  else:
192  value = self._getText(element)
193  break
194  if value != None:
195  value = converter(value)
196  ns_cache = self.cache.setdefault(namespace, {})
197  ns_cache[name] = value
198  return value
199  return get
200 
# Properties read from DC_NAMESPACE. Each one is a read-only property
# built from one of the _getter_* factories; the bare string after each
# assignment is its attribute docstring.
dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
"""
Contributors to the resource (other than the authors). An unsorted
array of names.
"""

dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
"""
Text describing the extent or scope of the resource.
"""

dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
"""
A sorted array of names of the authors of the resource, listed in order
of precedence.
"""

dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
"""
A sorted array of dates (datetime.datetime instances) of significance to
the resource. The dates and times are in UTC.
"""

dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
"""
A language-keyed dictionary of textual descriptions of the content of the
resource.
"""

dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
"""
The mime-type of the resource.
"""

dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
"""
Unique identifier of the resource.
"""

dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
"""
An unordered array specifying the languages used in the resource.
"""

dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
"""
An unordered array of publisher names.
"""

dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
"""
An unordered array of text descriptions of relationships to other
documents.
"""

dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
"""
A language-keyed dictionary of textual descriptions of the rights the
user has to this resource.
"""

dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
"""
Unique identifier of the work from which this resource was derived.
"""

dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
"""
An unordered array of descriptive phrases or keywords that specify the
topic of the content of the resource.
"""

dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
"""
A language-keyed dictionary of the title of the resource.
"""

dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
"""
An unordered array of textual descriptions of the document type.
"""
282 
# Properties read from PDF_NAMESPACE. All three are single-valued and
# returned as unconverted text via _converter_string.
pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
"""
An unformatted text string representing document keywords.
"""

pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
"""
The PDF file version, for example 1.0, 1.3.
"""

pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
"""
The name of the tool that created the PDF document.
"""
297 
# Properties read from XMP_NAMESPACE. The three date properties are passed
# through _converter_date; the creator tool is returned as text.
xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
"""
The date and time the resource was originally created. The date and
time are returned as a UTC datetime.datetime object.
"""

xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
"""
The date and time the resource was last modified. The date and time
are returned as a UTC datetime.datetime object.
"""

xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
"""
The date and time that any metadata for this resource was last
changed. The date and time are returned as a UTC datetime.datetime
object.
"""

xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
"""
The name of the first known tool used to create the resource.
"""

# Properties read from XMPMM_NAMESPACE, both returned as text.
xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
"""
The common identifier for all versions and renditions of this resource.
"""

xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
"""
An identifier for a specific incarnation of a document, updated each
time a file is saved.
"""
332 
def custom_properties(self):
    """
    Retrieves custom metadata properties defined in the undocumented pdfx
    metadata schema.

    Keys are un-escaped: every \\u2182 marker followed by four hexadecimal
    digits is replaced by the character with that code point. The result is
    computed once and then kept on ``self._custom_properties``.

    :return: a dictionary of key/value items for custom metadata properties.
    :rtype: dict
    :raises ValueError: if a marker is not followed by four hex digits.
    """
    if not hasattr(self, "_custom_properties"):
        # Build into a local dict so that a failure while decoding does not
        # leave a half-filled result cached on the instance.
        properties = {}
        marker = u_("\u2182")
        for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
            key = node.localName
            search_from = 0
            while True:
                # see documentation about PDFX_NAMESPACE earlier in file
                idx = key.find(marker, search_from)
                if idx == -1:
                    break
                key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                # Resume after the decoded character: if it is itself
                # \u2182 it must not be read as another escape marker.
                search_from = idx + 1
            if node.nodeType == node.ATTRIBUTE_NODE:
                value = node.nodeValue
            else:
                value = self._getText(node)
            properties[key] = value
        self._custom_properties = properties
    return self._custom_properties

# Expose the method as a read-only property; property() takes its
# documentation from the function's docstring.
custom_properties = property(custom_properties)
utils.u_
def u_(s)
Definition: utils.py:244
xmp.XmpInformation.writeToStream
def writeToStream(self, stream, encryption_key)
Definition: xmp.py:66
xmp.XmpInformation._custom_properties
_custom_properties
Definition: xmp.py:335
xmp.XmpInformation
Definition: xmp.py:54
xmp.XmpInformation.stream
stream
Definition: xmp.py:61
xmp.XmpInformation._getText
def _getText(self, element)
Definition: xmp.py:89
xmp.XmpInformation.rdfRoot
rdfRoot
Definition: xmp.py:63
xmp.XmpInformation.__init__
def __init__(self, stream)
Definition: xmp.py:60
xmp.XmpInformation.getElement
def getElement(self, aboutUri, namespace, name)
Definition: xmp.py:69
xmp.XmpInformation.getNodesInNamespace
def getNodesInNamespace(self, aboutUri, namespace)
Definition: xmp.py:78
generic.PdfObject
Definition: generic.py:101
xmp.XmpInformation.custom_properties
custom_properties
Definition: xmp.py:351
xmp.XmpInformation.cache
cache
Definition: xmp.py:64