Development of an internal social media platform with personalised dashboards for students
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long. 35KB

  1. import calendar
  2. import codecs
  3. import collections
  4. import mmap
  5. import os
  6. import re
  7. import time
  8. import zlib
  9. from ._util import py3
  10. try:
  11. from UserDict import UserDict # Python 2.x
  12. except ImportError:
  13. UserDict = collections.UserDict # Python 3.x
  14. if py3: # Python 3.x
  15. def make_bytes(s):
  16. return s.encode("us-ascii")
  17. else: # Python 2.x
  18. def make_bytes(s): # pragma: no cover
  19. return s # pragma: no cover
  20. # see Text String Type on page 86 and D.3 PDFDocEncoding Character Set
  21. # on page 656
  22. def encode_text(s):
  23. return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
  24. PDFDocEncoding = {
  25. 0x16: u"\u0017",
  26. 0x18: u"\u02D8",
  27. 0x19: u"\u02C7",
  28. 0x1A: u"\u02C6",
  29. 0x1B: u"\u02D9",
  30. 0x1C: u"\u02DD",
  31. 0x1D: u"\u02DB",
  32. 0x1E: u"\u02DA",
  33. 0x1F: u"\u02DC",
  34. 0x80: u"\u2022",
  35. 0x81: u"\u2020",
  36. 0x82: u"\u2021",
  37. 0x83: u"\u2026",
  38. 0x84: u"\u2014",
  39. 0x85: u"\u2013",
  40. 0x86: u"\u0192",
  41. 0x87: u"\u2044",
  42. 0x88: u"\u2039",
  43. 0x89: u"\u203A",
  44. 0x8A: u"\u2212",
  45. 0x8B: u"\u2030",
  46. 0x8C: u"\u201E",
  47. 0x8D: u"\u201C",
  48. 0x8E: u"\u201D",
  49. 0x8F: u"\u2018",
  50. 0x90: u"\u2019",
  51. 0x91: u"\u201A",
  52. 0x92: u"\u2122",
  53. 0x93: u"\uFB01",
  54. 0x94: u"\uFB02",
  55. 0x95: u"\u0141",
  56. 0x96: u"\u0152",
  57. 0x97: u"\u0160",
  58. 0x98: u"\u0178",
  59. 0x99: u"\u017D",
  60. 0x9A: u"\u0131",
  61. 0x9B: u"\u0142",
  62. 0x9C: u"\u0153",
  63. 0x9D: u"\u0161",
  64. 0x9E: u"\u017E",
  65. 0xA0: u"\u20AC",
  66. }
  67. def decode_text(b):
  68. if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
  69. return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be")
  70. elif py3: # Python 3.x
  71. return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
  72. else: # Python 2.x
  73. return u"".join(PDFDocEncoding.get(ord(byte), byte) for byte in b)
  74. class PdfFormatError(RuntimeError):
  75. """An error that probably indicates a syntactic or semantic error in the
  76. PDF file structure"""
  77. pass
  78. def check_format_condition(condition, error_message):
  79. if not condition:
  80. raise PdfFormatError(error_message)
  81. class IndirectReference(collections.namedtuple("IndirectReferenceTuple",
  82. ["object_id", "generation"])):
  83. def __str__(self):
  84. return "%s %s R" % self
  85. def __bytes__(self):
  86. return self.__str__().encode("us-ascii")
  87. def __eq__(self, other):
  88. return other.__class__ is self.__class__ and \
  89. other.object_id == self.object_id and \
  90. other.generation == self.generation
  91. def __ne__(self, other):
  92. return not (self == other)
  93. def __hash__(self):
  94. return hash((self.object_id, self.generation))
  95. class IndirectObjectDef(IndirectReference):
  96. def __str__(self):
  97. return "%s %s obj" % self
  98. class XrefTable:
  99. def __init__(self):
  100. self.existing_entries = {} # object ID => (offset, generation)
  101. self.new_entries = {} # object ID => (offset, generation)
  102. self.deleted_entries = {0: 65536} # object ID => generation
  103. self.reading_finished = False
  104. def __setitem__(self, key, value):
  105. if self.reading_finished:
  106. self.new_entries[key] = value
  107. else:
  108. self.existing_entries[key] = value
  109. if key in self.deleted_entries:
  110. del self.deleted_entries[key]
  111. def __getitem__(self, key):
  112. try:
  113. return self.new_entries[key]
  114. except KeyError:
  115. return self.existing_entries[key]
  116. def __delitem__(self, key):
  117. if key in self.new_entries:
  118. generation = self.new_entries[key][1] + 1
  119. del self.new_entries[key]
  120. self.deleted_entries[key] = generation
  121. elif key in self.existing_entries:
  122. generation = self.existing_entries[key][1] + 1
  123. self.deleted_entries[key] = generation
  124. elif key in self.deleted_entries:
  125. generation = self.deleted_entries[key]
  126. else:
  127. raise IndexError("object ID " + str(key) +
  128. " cannot be deleted because it doesn't exist")
  129. def __contains__(self, key):
  130. return key in self.existing_entries or key in self.new_entries
  131. def __len__(self):
  132. return len(set(self.existing_entries.keys()) |
  133. set(self.new_entries.keys()) |
  134. set(self.deleted_entries.keys()))
  135. def keys(self):
  136. return (
  137. set(self.existing_entries.keys()) -
  138. set(self.deleted_entries.keys())
  139. ) | set(self.new_entries.keys())
  140. def write(self, f):
  141. keys = sorted(set(self.new_entries.keys()) |
  142. set(self.deleted_entries.keys()))
  143. deleted_keys = sorted(set(self.deleted_entries.keys()))
  144. startxref = f.tell()
  145. f.write(b"xref\n")
  146. while keys:
  147. # find a contiguous sequence of object IDs
  148. prev = None
  149. for index, key in enumerate(keys):
  150. if prev is None or prev+1 == key:
  151. prev = key
  152. else:
  153. contiguous_keys = keys[:index]
  154. keys = keys[index:]
  155. break
  156. else:
  157. contiguous_keys = keys
  158. keys = None
  159. f.write(make_bytes("%d %d\n" %
  160. (contiguous_keys[0], len(contiguous_keys))))
  161. for object_id in contiguous_keys:
  162. if object_id in self.new_entries:
  163. f.write(make_bytes("%010d %05d n \n" %
  164. self.new_entries[object_id]))
  165. else:
  166. this_deleted_object_id = deleted_keys.pop(0)
  167. check_format_condition(object_id == this_deleted_object_id,
  168. "expected the next deleted object "
  169. "ID to be %s, instead found %s" %
  170. (object_id, this_deleted_object_id))
  171. try:
  172. next_in_linked_list = deleted_keys[0]
  173. except IndexError:
  174. next_in_linked_list = 0
  175. f.write(make_bytes("%010d %05d f \n" %
  176. (next_in_linked_list,
  177. self.deleted_entries[object_id])))
  178. return startxref
  179. class PdfName:
  180. def __init__(self, name):
  181. if isinstance(name, PdfName):
  182. =
  183. elif isinstance(name, bytes):
  184. = name
  185. else:
  186. = name.encode("us-ascii")
  187. def name_as_str(self):
  188. return"us-ascii")
  189. def __eq__(self, other):
  190. return (isinstance(other, PdfName) and == or \
  191. other ==
  192. def __hash__(self):
  193. return hash(
  194. def __repr__(self):
  195. return "PdfName(%s)" % repr(
  196. @classmethod
  197. def from_pdf_stream(cls, data):
  198. return cls(PdfParser.interpret_name(data))
  199. allowed_chars = set(range(33, 127)) - set(ord(c) for c in "#%/()<>[]{}")
  200. def __bytes__(self):
  201. result = bytearray(b"/")
  202. for b in
  203. if py3: # Python 3.x
  204. if b in self.allowed_chars:
  205. result.append(b)
  206. else:
  207. result.extend(make_bytes("#%02X" % b))
  208. else: # Python 2.x
  209. if ord(b) in self.allowed_chars:
  210. result.append(b)
  211. else:
  212. result.extend(b"#%02X" % ord(b))
  213. return bytes(result)
  214. __str__ = __bytes__
  215. class PdfArray(list):
  216. def __bytes__(self):
  217. return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"
  218. __str__ = __bytes__
  219. class PdfDict(UserDict):
  220. def __setattr__(self, key, value):
  221. if key == "data":
  222. if hasattr(UserDict, "__setattr__"):
  223. UserDict.__setattr__(self, key, value)
  224. else:
  225. self.__dict__[key] = value
  226. else:
  227. if isinstance(key, str):
  228. key = key.encode("us-ascii")
  229. self[key] = value
  230. def __getattr__(self, key):
  231. try:
  232. value = self[key]
  233. except KeyError:
  234. try:
  235. value = self[key.encode("us-ascii")]
  236. except KeyError:
  237. raise AttributeError(key)
  238. if isinstance(value, bytes):
  239. value = decode_text(value)
  240. if key.endswith("Date"):
  241. if value.startswith("D:"):
  242. value = value[2:]
  243. relationship = 'Z'
  244. if len(value) > 17:
  245. relationship = value[14]
  246. offset = int(value[15:17]) * 60
  247. if len(value) > 20:
  248. offset += int(value[18:20])
  249. format = '%Y%m%d%H%M%S'[:len(value) - 2]
  250. value = time.strptime(value[:len(format)+2], format)
  251. if relationship in ['+', '-']:
  252. offset *= 60
  253. if relationship == '+':
  254. offset *= -1
  255. value = time.gmtime(calendar.timegm(value) + offset)
  256. return value
  257. def __bytes__(self):
  258. out = bytearray(b"<<")
  259. for key, value in self.items():
  260. if value is None:
  261. continue
  262. value = pdf_repr(value)
  263. out.extend(b"\n")
  264. out.extend(bytes(PdfName(key)))
  265. out.extend(b" ")
  266. out.extend(value)
  267. out.extend(b"\n>>")
  268. return bytes(out)
  269. if not py3:
  270. __str__ = __bytes__
  271. class PdfBinary:
  272. def __init__(self, data):
  273. = data
  274. if py3: # Python 3.x
  275. def __bytes__(self):
  276. return make_bytes("<%s>" % "".join("%02X" % b for b in
  277. else: # Python 2.x
  278. def __str__(self):
  279. return "<%s>" % "".join("%02X" % ord(b) for b in
  280. class PdfStream:
  281. def __init__(self, dictionary, buf):
  282. self.dictionary = dictionary
  283. self.buf = buf
  284. def decode(self):
  285. try:
  286. filter = self.dictionary.Filter
  287. except AttributeError:
  288. return self.buf
  289. if filter == b"FlateDecode":
  290. try:
  291. expected_length = self.dictionary.DL
  292. except AttributeError:
  293. expected_length = self.dictionary.Length
  294. return zlib.decompress(self.buf, bufsize=int(expected_length))
  295. else:
  296. raise NotImplementedError(
  297. "stream filter %s unknown/unsupported" %
  298. repr(self.dictionary.Filter))
  299. def pdf_repr(x):
  300. if x is True:
  301. return b"true"
  302. elif x is False:
  303. return b"false"
  304. elif x is None:
  305. return b"null"
  306. elif (isinstance(x, PdfName) or isinstance(x, PdfDict) or
  307. isinstance(x, PdfArray) or isinstance(x, PdfBinary)):
  308. return bytes(x)
  309. elif isinstance(x, int):
  310. return str(x).encode("us-ascii")
  311. elif isinstance(x, time.struct_time):
  312. return b'(D:'+time.strftime('%Y%m%d%H%M%SZ', x).encode("us-ascii")+b')'
  313. elif isinstance(x, dict):
  314. return bytes(PdfDict(x))
  315. elif isinstance(x, list):
  316. return bytes(PdfArray(x))
  317. elif ((py3 and isinstance(x, str)) or
  318. (not py3 and isinstance(x, unicode))):
  319. return pdf_repr(encode_text(x))
  320. elif isinstance(x, bytes):
  321. # XXX escape more chars? handle binary garbage
  322. x = x.replace(b"\\", b"\\\\")
  323. x = x.replace(b"(", b"\\(")
  324. x = x.replace(b")", b"\\)")
  325. return b"(" + x + b")"
  326. else:
  327. return bytes(x)
  328. class PdfParser:
  329. """Based on
  330. Supports PDF up to 1.4
  331. """
  332. def __init__(self, filename=None, f=None,
  333. buf=None, start_offset=0, mode="rb"):
  334. # type: (PdfParser, str, file, Union[bytes, bytearray], int, str)
  335. # -> None
  336. if buf and f:
  337. raise RuntimeError(
  338. "specify buf or f or filename, but not both buf and f")
  339. self.filename = filename
  340. self.buf = buf
  341. self.f = f
  342. self.start_offset = start_offset
  343. self.should_close_buf = False
  344. self.should_close_file = False
  345. if filename is not None and f is None:
  346. self.f = f = open(filename, mode)
  347. self.should_close_file = True
  348. if f is not None:
  349. self.buf = buf = self.get_buf_from_file(f)
  350. self.should_close_buf = True
  351. if not filename and hasattr(f, "name"):
  352. self.filename =
  353. self.cached_objects = {}
  354. if buf:
  355. self.read_pdf_info()
  356. else:
  357. self.file_size_total = self.file_size_this = 0
  358. self.root = PdfDict()
  359. self.root_ref = None
  360. = PdfDict()
  361. self.info_ref = None
  362. self.page_tree_root = {}
  363. self.pages = []
  364. self.orig_pages = []
  365. self.pages_ref = None
  366. self.last_xref_section_offset = None
  367. self.trailer_dict = {}
  368. self.xref_table = XrefTable()
  369. self.xref_table.reading_finished = True
  370. if f:
  371. self.seek_end()
  372. def __enter__(self):
  373. return self
  374. def __exit__(self, exc_type, exc_value, traceback):
  375. self.close()
  376. return False # do not suppress exceptions
  377. def start_writing(self):
  378. self.close_buf()
  379. self.seek_end()
  380. def close_buf(self):
  381. try:
  382. self.buf.close()
  383. except AttributeError:
  384. pass
  385. self.buf = None
  386. def close(self):
  387. if self.should_close_buf:
  388. self.close_buf()
  389. if self.f is not None and self.should_close_file:
  390. self.f.close()
  391. self.f = None
  392. def seek_end(self):
  393., os.SEEK_END)
  394. def write_header(self):
  395. self.f.write(b"%PDF-1.4\n")
  396. def write_comment(self, s):
  397. self.f.write(("%% %s\n" % (s,)).encode("utf-8"))
  398. def write_catalog(self):
  399. self.del_root()
  400. self.root_ref = self.next_object_id(self.f.tell())
  401. self.pages_ref = self.next_object_id(0)
  402. self.rewrite_pages()
  403. self.write_obj(self.root_ref,
  404. Type=PdfName(b"Catalog"),
  405. Pages=self.pages_ref)
  406. self.write_obj(self.pages_ref,
  407. Type=PdfName(b"Pages"),
  408. Count=len(self.pages),
  409. Kids=self.pages)
  410. return self.root_ref
  411. def rewrite_pages(self):
  412. pages_tree_nodes_to_delete = []
  413. for i, page_ref in enumerate(self.orig_pages):
  414. page_info = self.cached_objects[page_ref]
  415. del self.xref_table[page_ref.object_id]
  416. pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
  417. if page_ref not in self.pages:
  418. # the page has been deleted
  419. continue
  420. # make dict keys into strings for passing to write_page
  421. stringified_page_info = {}
  422. for key, value in page_info.items():
  423. # key should be a PdfName
  424. stringified_page_info[key.name_as_str()] = value
  425. stringified_page_info["Parent"] = self.pages_ref
  426. new_page_ref = self.write_page(None, **stringified_page_info)
  427. for j, cur_page_ref in enumerate(self.pages):
  428. if cur_page_ref == page_ref:
  429. # replace the page reference with the new one
  430. self.pages[j] = new_page_ref
  431. # delete redundant Pages tree nodes from xref table
  432. for pages_tree_node_ref in pages_tree_nodes_to_delete:
  433. while pages_tree_node_ref:
  434. pages_tree_node = self.cached_objects[pages_tree_node_ref]
  435. if pages_tree_node_ref.object_id in self.xref_table:
  436. del self.xref_table[pages_tree_node_ref.object_id]
  437. pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
  438. self.orig_pages = []
  439. def write_xref_and_trailer(self, new_root_ref=None):
  440. if new_root_ref:
  441. self.del_root()
  442. self.root_ref = new_root_ref
  443. if
  444. self.info_ref = self.write_obj(None,
  445. start_xref = self.xref_table.write(self.f)
  446. num_entries = len(self.xref_table)
  447. trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
  448. if self.last_xref_section_offset is not None:
  449. trailer_dict[b"Prev"] = self.last_xref_section_offset
  450. if
  451. trailer_dict[b"Info"] = self.info_ref
  452. self.last_xref_section_offset = start_xref
  453. self.f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) +
  454. make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
  455. def write_page(self, ref, *objs, **dict_obj):
  456. if isinstance(ref, int):
  457. ref = self.pages[ref]
  458. if "Type" not in dict_obj:
  459. dict_obj["Type"] = PdfName(b"Page")
  460. if "Parent" not in dict_obj:
  461. dict_obj["Parent"] = self.pages_ref
  462. return self.write_obj(ref, *objs, **dict_obj)
  463. def write_obj(self, ref, *objs, **dict_obj):
  464. f = self.f
  465. if ref is None:
  466. ref = self.next_object_id(f.tell())
  467. else:
  468. self.xref_table[ref.object_id] = (f.tell(), ref.generation)
  469. f.write(bytes(IndirectObjectDef(*ref)))
  470. stream = dict_obj.pop("stream", None)
  471. if stream is not None:
  472. dict_obj["Length"] = len(stream)
  473. if dict_obj:
  474. f.write(pdf_repr(dict_obj))
  475. for obj in objs:
  476. f.write(pdf_repr(obj))
  477. if stream is not None:
  478. f.write(b"stream\n")
  479. f.write(stream)
  480. f.write(b"\nendstream\n")
  481. f.write(b"endobj\n")
  482. return ref
  483. def del_root(self):
  484. if self.root_ref is None:
  485. return
  486. del self.xref_table[self.root_ref.object_id]
  487. del self.xref_table[self.root[b"Pages"].object_id]
  488. @staticmethod
  489. def get_buf_from_file(f):
  490. if hasattr(f, "getbuffer"):
  491. return f.getbuffer()
  492. elif hasattr(f, "getvalue"):
  493. return f.getvalue()
  494. else:
  495. try:
  496. return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
  497. except ValueError: # cannot mmap an empty file
  498. return b""
  499. def read_pdf_info(self):
  500. self.file_size_total = len(self.buf)
  501. self.file_size_this = self.file_size_total - self.start_offset
  502. self.read_trailer()
  503. self.root_ref = self.trailer_dict[b"Root"]
  504. self.info_ref = self.trailer_dict.get(b"Info", None)
  505. self.root = PdfDict(self.read_indirect(self.root_ref))
  506. if self.info_ref is None:
  507. = PdfDict()
  508. else:
  509. = PdfDict(self.read_indirect(self.info_ref))
  510. check_format_condition(b"Type" in self.root, "/Type missing in Root")
  511. check_format_condition(self.root[b"Type"] == b"Catalog",
  512. "/Type in Root is not /Catalog")
  513. check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
  514. check_format_condition(isinstance(self.root[b"Pages"],
  515. IndirectReference),
  516. "/Pages in Root is not an indirect reference")
  517. self.pages_ref = self.root[b"Pages"]
  518. self.page_tree_root = self.read_indirect(self.pages_ref)
  519. self.pages = self.linearize_page_tree(self.page_tree_root)
  520. # save the original list of page references
  521. # in case the user modifies, adds or deletes some pages
  522. # and we need to rewrite the pages and their list
  523. self.orig_pages = self.pages[:]
  524. def next_object_id(self, offset=None):
  525. try:
  526. # TODO: support reuse of deleted objects
  527. reference = IndirectReference(max(self.xref_table.keys()) + 1, 0)
  528. except ValueError:
  529. reference = IndirectReference(1, 0)
  530. if offset is not None:
  531. self.xref_table[reference.object_id] = (offset, 0)
  532. return reference
  533. delimiter = br"[][()<>{}/%]"
  534. delimiter_or_ws = br"[][()<>{}/%\000\011\012\014\015\040]"
  535. whitespace = br"[\000\011\012\014\015\040]"
  536. whitespace_or_hex = br"[\000\011\012\014\015\0400-9a-fA-F]"
  537. whitespace_optional = whitespace + b"*"
  538. whitespace_mandatory = whitespace + b"+"
  539. newline_only = br"[\r\n]+"
  540. newline = whitespace_optional + newline_only + whitespace_optional
  541. re_trailer_end = re.compile(
  542. whitespace_mandatory + br"trailer" + whitespace_optional +
  543. br"\<\<(.*\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" +
  544. newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
  545. re_trailer_prev = re.compile(
  546. whitespace_optional + br"trailer" + whitespace_optional +
  547. br"\<\<(.*?\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" +
  548. newline + br"%%EOF" + whitespace_optional, re.DOTALL)
  549. def read_trailer(self):
  550. search_start_offset = len(self.buf) - 16384
  551. if search_start_offset < self.start_offset:
  552. search_start_offset = self.start_offset
  553. m =, search_start_offset)
  554. check_format_condition(m, "trailer end not found")
  555. # make sure we found the LAST trailer
  556. last_match = m
  557. while m:
  558. last_match = m
  559. m =, m.start()+16)
  560. if not m:
  561. m = last_match
  562. trailer_data =
  563. self.last_xref_section_offset = int(
  564. self.trailer_dict = self.interpret_trailer(trailer_data)
  565. self.xref_table = XrefTable()
  566. self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
  567. if b"Prev" in self.trailer_dict:
  568. self.read_prev_trailer(self.trailer_dict[b"Prev"])
  569. def read_prev_trailer(self, xref_section_offset):
  570. trailer_offset = self.read_xref_table(
  571. xref_section_offset=xref_section_offset)
  572. m =
  573. self.buf[trailer_offset:trailer_offset+16384])
  574. check_format_condition(m, "previous trailer not found")
  575. trailer_data =
  576. check_format_condition(int( == xref_section_offset,
  577. "xref section offset in previous trailer "
  578. "doesn't match what was expected")
  579. trailer_dict = self.interpret_trailer(trailer_data)
  580. if b"Prev" in trailer_dict:
  581. self.read_prev_trailer(trailer_dict[b"Prev"])
  582. re_whitespace_optional = re.compile(whitespace_optional)
  583. re_name = re.compile(
  584. whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" +
  585. delimiter_or_ws + br")")
  586. re_dict_start = re.compile(whitespace_optional + br"\<\<")
  587. re_dict_end = re.compile(
  588. whitespace_optional + br"\>\>" + whitespace_optional)
  589. @classmethod
  590. def interpret_trailer(cls, trailer_data):
  591. trailer = {}
  592. offset = 0
  593. while True:
  594. m = cls.re_name.match(trailer_data, offset)
  595. if not m:
  596. m = cls.re_dict_end.match(trailer_data, offset)
  597. check_format_condition(
  598. m and m.end() == len(trailer_data),
  599. "name not found in trailer, remaining data: " +
  600. repr(trailer_data[offset:]))
  601. break
  602. key = cls.interpret_name(
  603. value, offset = cls.get_value(trailer_data, m.end())
  604. trailer[key] = value
  605. check_format_condition(
  606. b"Size" in trailer and isinstance(trailer[b"Size"], int),
  607. "/Size not in trailer or not an integer")
  608. check_format_condition(
  609. b"Root" in trailer and
  610. isinstance(trailer[b"Root"], IndirectReference),
  611. "/Root not in trailer or not an indirect reference")
  612. return trailer
  613. re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?")
  614. @classmethod
  615. def interpret_name(cls, raw, as_text=False):
  616. name = b""
  617. for m in cls.re_hashes_in_name.finditer(raw):
  618. if
  619. name += + \
  620. bytearray.fromhex("us-ascii"))
  621. else:
  622. name +=
  623. if as_text:
  624. return name.decode("utf-8")
  625. else:
  626. return bytes(name)
  627. re_null = re.compile(
  628. whitespace_optional + br"null(?=" + delimiter_or_ws + br")")
  629. re_true = re.compile(
  630. whitespace_optional + br"true(?=" + delimiter_or_ws + br")")
  631. re_false = re.compile(
  632. whitespace_optional + br"false(?=" + delimiter_or_ws + br")")
  633. re_int = re.compile(
  634. whitespace_optional + br"([-+]?[0-9]+)(?=" + delimiter_or_ws + br")")
  635. re_real = re.compile(
  636. whitespace_optional + br"([-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+))(?=" +
  637. delimiter_or_ws + br")")
  638. re_array_start = re.compile(whitespace_optional + br"\[")
  639. re_array_end = re.compile(whitespace_optional + br"]")
  640. re_string_hex = re.compile(
  641. whitespace_optional + br"\<(" + whitespace_or_hex + br"*)\>")
  642. re_string_lit = re.compile(whitespace_optional + br"\(")
  643. re_indirect_reference = re.compile(
  644. whitespace_optional + br"([-+]?[0-9]+)" + whitespace_mandatory +
  645. br"([-+]?[0-9]+)" + whitespace_mandatory + br"R(?=" + delimiter_or_ws +
  646. br")")
  647. re_indirect_def_start = re.compile(
  648. whitespace_optional + br"([-+]?[0-9]+)" + whitespace_mandatory +
  649. br"([-+]?[0-9]+)" + whitespace_mandatory + br"obj(?=" +
  650. delimiter_or_ws + br")")
  651. re_indirect_def_end = re.compile(
  652. whitespace_optional + br"endobj(?=" + delimiter_or_ws + br")")
  653. re_comment = re.compile(
  654. br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*")
  655. re_stream_start = re.compile(whitespace_optional + br"stream\r?\n")
  656. re_stream_end = re.compile(
  657. whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")")
  658. @classmethod
  659. def get_value(cls, data, offset, expect_indirect=None, max_nesting=-1):
  660. if max_nesting == 0:
  661. return None, None
  662. m = cls.re_comment.match(data, offset)
  663. if m:
  664. offset = m.end()
  665. m = cls.re_indirect_def_start.match(data, offset)
  666. if m:
  667. check_format_condition(
  668. int( > 0,
  669. "indirect object definition: object ID must be greater than 0")
  670. check_format_condition(
  671. int( >= 0,
  672. "indirect object definition: generation must be non-negative")
  673. check_format_condition(
  674. expect_indirect is None or expect_indirect ==
  675. IndirectReference(int(, int(,
  676. "indirect object definition different than expected")
  677. object, offset = cls.get_value(
  678. data, m.end(), max_nesting=max_nesting-1)
  679. if offset is None:
  680. return object, None
  681. m = cls.re_indirect_def_end.match(data, offset)
  682. check_format_condition(
  683. m, "indirect object definition end not found")
  684. return object, m.end()
  685. check_format_condition(
  686. not expect_indirect, "indirect object definition not found")
  687. m = cls.re_indirect_reference.match(data, offset)
  688. if m:
  689. check_format_condition(
  690. int( > 0,
  691. "indirect object reference: object ID must be greater than 0")
  692. check_format_condition(
  693. int( >= 0,
  694. "indirect object reference: generation must be non-negative")
  695. return IndirectReference(int(, int(, m.end()
  696. m = cls.re_dict_start.match(data, offset)
  697. if m:
  698. offset = m.end()
  699. result = {}
  700. m = cls.re_dict_end.match(data, offset)
  701. while not m:
  702. key, offset = cls.get_value(
  703. data, offset, max_nesting=max_nesting-1)
  704. if offset is None:
  705. return result, None
  706. value, offset = cls.get_value(
  707. data, offset, max_nesting=max_nesting-1)
  708. result[key] = value
  709. if offset is None:
  710. return result, None
  711. m = cls.re_dict_end.match(data, offset)
  712. offset = m.end()
  713. m = cls.re_stream_start.match(data, offset)
  714. if m:
  715. try:
  716. stream_len = int(result[b"Length"])
  717. except (TypeError, KeyError, ValueError):
  718. raise PdfFormatError(
  719. "bad or missing Length in stream dict (%r)" %
  720. result.get(b"Length", None))
  721. stream_data = data[m.end():m.end() + stream_len]
  722. m = cls.re_stream_end.match(data, m.end() + stream_len)
  723. check_format_condition(m, "stream end not found")
  724. offset = m.end()
  725. result = PdfStream(PdfDict(result), stream_data)
  726. else:
  727. result = PdfDict(result)
  728. return result, offset
  729. m = cls.re_array_start.match(data, offset)
  730. if m:
  731. offset = m.end()
  732. result = []
  733. m = cls.re_array_end.match(data, offset)
  734. while not m:
  735. value, offset = cls.get_value(
  736. data, offset, max_nesting=max_nesting-1)
  737. result.append(value)
  738. if offset is None:
  739. return result, None
  740. m = cls.re_array_end.match(data, offset)
  741. return result, m.end()
  742. m = cls.re_null.match(data, offset)
  743. if m:
  744. return None, m.end()
  745. m = cls.re_true.match(data, offset)
  746. if m:
  747. return True, m.end()
  748. m = cls.re_false.match(data, offset)
  749. if m:
  750. return False, m.end()
  751. m = cls.re_name.match(data, offset)
  752. if m:
  753. return PdfName(cls.interpret_name(, m.end()
  754. m = cls.re_int.match(data, offset)
  755. if m:
  756. return int(, m.end()
  757. m = cls.re_real.match(data, offset)
  758. if m:
  759. # XXX Decimal instead of float???
  760. return float(, m.end()
  761. m = cls.re_string_hex.match(data, offset)
  762. if m:
  763. # filter out whitespace
  764. hex_string = bytearray([
  765. b for b in
  766. if b in b"0123456789abcdefABCDEF"
  767. ])
  768. if len(hex_string) % 2 == 1:
  769. # append a 0 if the length is not even - yes, at the end
  770. hex_string.append(ord(b"0"))
  771. return bytearray.fromhex(hex_string.decode("us-ascii")), m.end()
  772. m = cls.re_string_lit.match(data, offset)
  773. if m:
  774. return cls.get_literal_string(data, m.end())
  775. # return None, offset # fallback (only for debugging)
  776. raise PdfFormatError(
  777. "unrecognized object: " + repr(data[offset:offset+32]))
  778. re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))")
  779. escaped_chars = {
  780. b"n": b"\n",
  781. b"r": b"\r",
  782. b"t": b"\t",
  783. b"b": b"\b",
  784. b"f": b"\f",
  785. b"(": b"(",
  786. b")": b")",
  787. b"\\": b"\\",
  788. ord(b"n"): b"\n",
  789. ord(b"r"): b"\r",
  790. ord(b"t"): b"\t",
  791. ord(b"b"): b"\b",
  792. ord(b"f"): b"\f",
  793. ord(b"("): b"(",
  794. ord(b")"): b")",
  795. ord(b"\\"): b"\\",
  796. }
  797. @classmethod
  798. def get_literal_string(cls, data, offset):
  799. nesting_depth = 0
  800. result = bytearray()
  801. for m in cls.re_lit_str_token.finditer(data, offset):
  802. result.extend(data[offset:m.start()])
  803. if
  804. result.extend(cls.escaped_chars[[1]])
  805. elif
  806. result.append(int([1:], 8))
  807. elif
  808. pass
  809. elif
  810. result.extend(b"\n")
  811. elif
  812. result.extend(b"(")
  813. nesting_depth += 1
  814. elif
  815. if nesting_depth == 0:
  816. return bytes(result), m.end()
  817. result.extend(b")")
  818. nesting_depth -= 1
  819. offset = m.end()
  820. raise PdfFormatError("unfinished literal string")
  821. re_xref_section_start = re.compile(
  822. whitespace_optional + br"xref" + newline)
  823. re_xref_subsection_start = re.compile(
  824. whitespace_optional + br"([0-9]+)" + whitespace_mandatory +
  825. br"([0-9]+)" + whitespace_optional + newline_only)
  826. re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")
  827. def read_xref_table(self, xref_section_offset):
  828. subsection_found = False
  829. m = self.re_xref_section_start.match(
  830. self.buf, xref_section_offset + self.start_offset)
  831. check_format_condition(m, "xref section start not found")
  832. offset = m.end()
  833. while True:
  834. m = self.re_xref_subsection_start.match(self.buf, offset)
  835. if not m:
  836. check_format_condition(
  837. subsection_found, "xref subsection start not found")
  838. break
  839. subsection_found = True
  840. offset = m.end()
  841. first_object = int(
  842. num_objects = int(
  843. for i in range(first_object, first_object+num_objects):
  844. m = self.re_xref_entry.match(self.buf, offset)
  845. check_format_condition(m, "xref entry not found")
  846. offset = m.end()
  847. is_free = == b"f"
  848. generation = int(
  849. if not is_free:
  850. new_entry = (int(, generation)
  851. check_format_condition(
  852. i not in self.xref_table or
  853. self.xref_table[i] == new_entry,
  854. "xref entry duplicated (and not identical)")
  855. self.xref_table[i] = new_entry
  856. return offset
  857. def read_indirect(self, ref, max_nesting=-1):
  858. offset, generation = self.xref_table[ref[0]]
  859. check_format_condition(
  860. generation == ref[1],
  861. "expected to find generation %s for object ID %s in xref table, "
  862. "instead found generation %s at offset %s"
  863. % (ref[1], ref[0], generation, offset))
  864. value = self.get_value(self.buf, offset + self.start_offset,
  865. expect_indirect=IndirectReference(*ref),
  866. max_nesting=max_nesting)[0]
  867. self.cached_objects[ref] = value
  868. return value
  869. def linearize_page_tree(self, node=None):
  870. if node is None:
  871. node = self.page_tree_root
  872. check_format_condition(
  873. node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages")
  874. pages = []
  875. for kid in node[b"Kids"]:
  876. kid_object = self.read_indirect(kid)
  877. if kid_object[b"Type"] == b"Page":
  878. pages.append(kid)
  879. else:
  880. pages.extend(self.linearize_page_tree(node=kid_object))
  881. return pages