You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

xml_serializer.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. """
  2. XML serializer.
  3. """
  4. from xml.dom import pulldom
  5. from xml.sax import handler
  6. from xml.sax.expatreader import ExpatParser as _ExpatParser
  7. from django.apps import apps
  8. from django.conf import settings
  9. from django.core.exceptions import ObjectDoesNotExist
  10. from django.core.serializers import base
  11. from django.db import DEFAULT_DB_ALIAS, models
  12. from django.utils.xmlutils import (
  13. SimplerXMLGenerator, UnserializableContentError,
  14. )
  15. class Serializer(base.Serializer):
  16. """Serialize a QuerySet to XML."""
  17. def indent(self, level):
  18. if self.options.get('indent') is not None:
  19. self.xml.ignorableWhitespace('\n' + ' ' * self.options.get('indent') * level)
  20. def start_serialization(self):
  21. """
  22. Start serialization -- open the XML document and the root element.
  23. """
  24. self.xml = SimplerXMLGenerator(self.stream, self.options.get("encoding", settings.DEFAULT_CHARSET))
  25. self.xml.startDocument()
  26. self.xml.startElement("django-objects", {"version": "1.0"})
  27. def end_serialization(self):
  28. """
  29. End serialization -- end the document.
  30. """
  31. self.indent(0)
  32. self.xml.endElement("django-objects")
  33. self.xml.endDocument()
  34. def start_object(self, obj):
  35. """
  36. Called as each object is handled.
  37. """
  38. if not hasattr(obj, "_meta"):
  39. raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
  40. self.indent(1)
  41. attrs = {'model': str(obj._meta)}
  42. if not self.use_natural_primary_keys or not hasattr(obj, 'natural_key'):
  43. obj_pk = obj.pk
  44. if obj_pk is not None:
  45. attrs['pk'] = str(obj_pk)
  46. self.xml.startElement("object", attrs)
  47. def end_object(self, obj):
  48. """
  49. Called after handling all fields for an object.
  50. """
  51. self.indent(1)
  52. self.xml.endElement("object")
  53. def handle_field(self, obj, field):
  54. """
  55. Handle each field on an object (except for ForeignKeys and
  56. ManyToManyFields).
  57. """
  58. self.indent(2)
  59. self.xml.startElement('field', {
  60. 'name': field.name,
  61. 'type': field.get_internal_type(),
  62. })
  63. # Get a "string version" of the object's data.
  64. if getattr(obj, field.name) is not None:
  65. try:
  66. self.xml.characters(field.value_to_string(obj))
  67. except UnserializableContentError:
  68. raise ValueError("%s.%s (pk:%s) contains unserializable characters" % (
  69. obj.__class__.__name__, field.name, obj.pk))
  70. else:
  71. self.xml.addQuickElement("None")
  72. self.xml.endElement("field")
  73. def handle_fk_field(self, obj, field):
  74. """
  75. Handle a ForeignKey (they need to be treated slightly
  76. differently from regular fields).
  77. """
  78. self._start_relational_field(field)
  79. related_att = getattr(obj, field.get_attname())
  80. if related_att is not None:
  81. if self.use_natural_foreign_keys and hasattr(field.remote_field.model, 'natural_key'):
  82. related = getattr(obj, field.name)
  83. # If related object has a natural key, use it
  84. related = related.natural_key()
  85. # Iterable natural keys are rolled out as subelements
  86. for key_value in related:
  87. self.xml.startElement("natural", {})
  88. self.xml.characters(str(key_value))
  89. self.xml.endElement("natural")
  90. else:
  91. self.xml.characters(str(related_att))
  92. else:
  93. self.xml.addQuickElement("None")
  94. self.xml.endElement("field")
  95. def handle_m2m_field(self, obj, field):
  96. """
  97. Handle a ManyToManyField. Related objects are only serialized as
  98. references to the object's PK (i.e. the related *data* is not dumped,
  99. just the relation).
  100. """
  101. if field.remote_field.through._meta.auto_created:
  102. self._start_relational_field(field)
  103. if self.use_natural_foreign_keys and hasattr(field.remote_field.model, 'natural_key'):
  104. # If the objects in the m2m have a natural key, use it
  105. def handle_m2m(value):
  106. natural = value.natural_key()
  107. # Iterable natural keys are rolled out as subelements
  108. self.xml.startElement("object", {})
  109. for key_value in natural:
  110. self.xml.startElement("natural", {})
  111. self.xml.characters(str(key_value))
  112. self.xml.endElement("natural")
  113. self.xml.endElement("object")
  114. else:
  115. def handle_m2m(value):
  116. self.xml.addQuickElement("object", attrs={
  117. 'pk': str(value.pk)
  118. })
  119. for relobj in getattr(obj, field.name).iterator():
  120. handle_m2m(relobj)
  121. self.xml.endElement("field")
  122. def _start_relational_field(self, field):
  123. """Output the <field> element for relational fields."""
  124. self.indent(2)
  125. self.xml.startElement('field', {
  126. 'name': field.name,
  127. 'rel': field.remote_field.__class__.__name__,
  128. 'to': str(field.remote_field.model._meta),
  129. })
  130. class Deserializer(base.Deserializer):
  131. """Deserialize XML."""
  132. def __init__(self, stream_or_string, *, using=DEFAULT_DB_ALIAS, ignorenonexistent=False, **options):
  133. super().__init__(stream_or_string, **options)
  134. self.handle_forward_references = options.pop('handle_forward_references', False)
  135. self.event_stream = pulldom.parse(self.stream, self._make_parser())
  136. self.db = using
  137. self.ignore = ignorenonexistent
  138. def _make_parser(self):
  139. """Create a hardened XML parser (no custom/external entities)."""
  140. return DefusedExpatParser()
  141. def __next__(self):
  142. for event, node in self.event_stream:
  143. if event == "START_ELEMENT" and node.nodeName == "object":
  144. self.event_stream.expandNode(node)
  145. return self._handle_object(node)
  146. raise StopIteration
  147. def _handle_object(self, node):
  148. """Convert an <object> node to a DeserializedObject."""
  149. # Look up the model using the model loading mechanism. If this fails,
  150. # bail.
  151. Model = self._get_model_from_node(node, "model")
  152. # Start building a data dictionary from the object.
  153. data = {}
  154. if node.hasAttribute('pk'):
  155. data[Model._meta.pk.attname] = Model._meta.pk.to_python(
  156. node.getAttribute('pk'))
  157. # Also start building a dict of m2m data (this is saved as
  158. # {m2m_accessor_attribute : [list_of_related_objects]})
  159. m2m_data = {}
  160. deferred_fields = {}
  161. field_names = {f.name for f in Model._meta.get_fields()}
  162. # Deserialize each field.
  163. for field_node in node.getElementsByTagName("field"):
  164. # If the field is missing the name attribute, bail (are you
  165. # sensing a pattern here?)
  166. field_name = field_node.getAttribute("name")
  167. if not field_name:
  168. raise base.DeserializationError("<field> node is missing the 'name' attribute")
  169. # Get the field from the Model. This will raise a
  170. # FieldDoesNotExist if, well, the field doesn't exist, which will
  171. # be propagated correctly unless ignorenonexistent=True is used.
  172. if self.ignore and field_name not in field_names:
  173. continue
  174. field = Model._meta.get_field(field_name)
  175. # As is usually the case, relation fields get the special treatment.
  176. if field.remote_field and isinstance(field.remote_field, models.ManyToManyRel):
  177. value = self._handle_m2m_field_node(field_node, field)
  178. if value == base.DEFER_FIELD:
  179. deferred_fields[field] = [
  180. [
  181. getInnerText(nat_node).strip()
  182. for nat_node in obj_node.getElementsByTagName('natural')
  183. ]
  184. for obj_node in field_node.getElementsByTagName('object')
  185. ]
  186. else:
  187. m2m_data[field.name] = value
  188. elif field.remote_field and isinstance(field.remote_field, models.ManyToOneRel):
  189. value = self._handle_fk_field_node(field_node, field)
  190. if value == base.DEFER_FIELD:
  191. deferred_fields[field] = [
  192. getInnerText(k).strip()
  193. for k in field_node.getElementsByTagName('natural')
  194. ]
  195. else:
  196. data[field.attname] = value
  197. else:
  198. if field_node.getElementsByTagName('None'):
  199. value = None
  200. else:
  201. value = field.to_python(getInnerText(field_node).strip())
  202. data[field.name] = value
  203. obj = base.build_instance(Model, data, self.db)
  204. # Return a DeserializedObject so that the m2m data has a place to live.
  205. return base.DeserializedObject(obj, m2m_data, deferred_fields)
  206. def _handle_fk_field_node(self, node, field):
  207. """
  208. Handle a <field> node for a ForeignKey
  209. """
  210. # Check if there is a child node named 'None', returning None if so.
  211. if node.getElementsByTagName('None'):
  212. return None
  213. else:
  214. model = field.remote_field.model
  215. if hasattr(model._default_manager, 'get_by_natural_key'):
  216. keys = node.getElementsByTagName('natural')
  217. if keys:
  218. # If there are 'natural' subelements, it must be a natural key
  219. field_value = [getInnerText(k).strip() for k in keys]
  220. try:
  221. obj = model._default_manager.db_manager(self.db).get_by_natural_key(*field_value)
  222. except ObjectDoesNotExist:
  223. if self.handle_forward_references:
  224. return base.DEFER_FIELD
  225. else:
  226. raise
  227. obj_pk = getattr(obj, field.remote_field.field_name)
  228. # If this is a natural foreign key to an object that
  229. # has a FK/O2O as the foreign key, use the FK value
  230. if field.remote_field.model._meta.pk.remote_field:
  231. obj_pk = obj_pk.pk
  232. else:
  233. # Otherwise, treat like a normal PK
  234. field_value = getInnerText(node).strip()
  235. obj_pk = model._meta.get_field(field.remote_field.field_name).to_python(field_value)
  236. return obj_pk
  237. else:
  238. field_value = getInnerText(node).strip()
  239. return model._meta.get_field(field.remote_field.field_name).to_python(field_value)
  240. def _handle_m2m_field_node(self, node, field):
  241. """
  242. Handle a <field> node for a ManyToManyField.
  243. """
  244. model = field.remote_field.model
  245. default_manager = model._default_manager
  246. if hasattr(default_manager, 'get_by_natural_key'):
  247. def m2m_convert(n):
  248. keys = n.getElementsByTagName('natural')
  249. if keys:
  250. # If there are 'natural' subelements, it must be a natural key
  251. field_value = [getInnerText(k).strip() for k in keys]
  252. obj_pk = default_manager.db_manager(self.db).get_by_natural_key(*field_value).pk
  253. else:
  254. # Otherwise, treat like a normal PK value.
  255. obj_pk = model._meta.pk.to_python(n.getAttribute('pk'))
  256. return obj_pk
  257. else:
  258. def m2m_convert(n):
  259. return model._meta.pk.to_python(n.getAttribute('pk'))
  260. values = []
  261. try:
  262. for c in node.getElementsByTagName('object'):
  263. values.append(m2m_convert(c))
  264. except Exception as e:
  265. if isinstance(e, ObjectDoesNotExist) and self.handle_forward_references:
  266. return base.DEFER_FIELD
  267. else:
  268. raise base.M2MDeserializationError(e, c)
  269. else:
  270. return values
  271. def _get_model_from_node(self, node, attr):
  272. """
  273. Look up a model from a <object model=...> or a <field rel=... to=...>
  274. node.
  275. """
  276. model_identifier = node.getAttribute(attr)
  277. if not model_identifier:
  278. raise base.DeserializationError(
  279. "<%s> node is missing the required '%s' attribute"
  280. % (node.nodeName, attr))
  281. try:
  282. return apps.get_model(model_identifier)
  283. except (LookupError, TypeError):
  284. raise base.DeserializationError(
  285. "<%s> node has invalid model identifier: '%s'"
  286. % (node.nodeName, model_identifier))
  287. def getInnerText(node):
  288. """Get all the inner text of a DOM node (recursively)."""
  289. # inspired by https://mail.python.org/pipermail/xml-sig/2005-March/011022.html
  290. inner_text = []
  291. for child in node.childNodes:
  292. if child.nodeType == child.TEXT_NODE or child.nodeType == child.CDATA_SECTION_NODE:
  293. inner_text.append(child.data)
  294. elif child.nodeType == child.ELEMENT_NODE:
  295. inner_text.extend(getInnerText(child))
  296. else:
  297. pass
  298. return "".join(inner_text)
  299. # Below code based on Christian Heimes' defusedxml
  300. class DefusedExpatParser(_ExpatParser):
  301. """
  302. An expat parser hardened against XML bomb attacks.
  303. Forbid DTDs, external entity references
  304. """
  305. def __init__(self, *args, **kwargs):
  306. super().__init__(*args, **kwargs)
  307. self.setFeature(handler.feature_external_ges, False)
  308. self.setFeature(handler.feature_external_pes, False)
  309. def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
  310. raise DTDForbidden(name, sysid, pubid)
  311. def entity_decl(self, name, is_parameter_entity, value, base,
  312. sysid, pubid, notation_name):
  313. raise EntitiesForbidden(name, value, base, sysid, pubid, notation_name)
  314. def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
  315. # expat 1.2
  316. raise EntitiesForbidden(name, None, base, sysid, pubid, notation_name)
  317. def external_entity_ref_handler(self, context, base, sysid, pubid):
  318. raise ExternalReferenceForbidden(context, base, sysid, pubid)
  319. def reset(self):
  320. _ExpatParser.reset(self)
  321. parser = self._parser
  322. parser.StartDoctypeDeclHandler = self.start_doctype_decl
  323. parser.EntityDeclHandler = self.entity_decl
  324. parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
  325. parser.ExternalEntityRefHandler = self.external_entity_ref_handler
  326. class DefusedXmlException(ValueError):
  327. """Base exception."""
  328. def __repr__(self):
  329. return str(self)
  330. class DTDForbidden(DefusedXmlException):
  331. """Document type definition is forbidden."""
  332. def __init__(self, name, sysid, pubid):
  333. super().__init__()
  334. self.name = name
  335. self.sysid = sysid
  336. self.pubid = pubid
  337. def __str__(self):
  338. tpl = "DTDForbidden(name='{}', system_id={!r}, public_id={!r})"
  339. return tpl.format(self.name, self.sysid, self.pubid)
  340. class EntitiesForbidden(DefusedXmlException):
  341. """Entity definition is forbidden."""
  342. def __init__(self, name, value, base, sysid, pubid, notation_name):
  343. super().__init__()
  344. self.name = name
  345. self.value = value
  346. self.base = base
  347. self.sysid = sysid
  348. self.pubid = pubid
  349. self.notation_name = notation_name
  350. def __str__(self):
  351. tpl = "EntitiesForbidden(name='{}', system_id={!r}, public_id={!r})"
  352. return tpl.format(self.name, self.sysid, self.pubid)
  353. class ExternalReferenceForbidden(DefusedXmlException):
  354. """Resolving an external reference is forbidden."""
  355. def __init__(self, context, base, sysid, pubid):
  356. super().__init__()
  357. self.context = context
  358. self.base = base
  359. self.sysid = sysid
  360. self.pubid = pubid
  361. def __str__(self):
  362. tpl = "ExternalReferenceForbidden(system_id='{}', public_id={})"
  363. return tpl.format(self.sysid, self.pubid)