Development of an internal social media platform with personalised dashboards for students
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

PdfParser.py 35KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974
  1. import calendar
  2. import codecs
  3. import collections
  4. import mmap
  5. import os
  6. import re
  7. import time
  8. import zlib
  9. from ._util import py3
  10. try:
  11. from UserDict import UserDict # Python 2.x
  12. except ImportError:
  13. UserDict = collections.UserDict # Python 3.x
# make_bytes(s) converts text to the byte string that is written to the file.
# Which definition is used is decided once, at import time, from ``py3``.
if py3:  # Python 3.x
    def make_bytes(s):
        # str -> bytes; non-ASCII characters raise UnicodeEncodeError
        return s.encode("us-ascii")
else:  # Python 2.x
    def make_bytes(s):  # pragma: no cover
        # returned unchanged on Python 2
        return s  # pragma: no cover
  20. # see 7.9.2.2 Text String Type on page 86 and D.3 PDFDocEncoding Character Set
  21. # on page 656
  22. def encode_text(s):
  23. return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
# Byte value -> Unicode character, used by decode_text() for byte strings
# that do not start with the UTF-16BE byte order mark.  Byte values missing
# from this table are decoded to the character with the same code.
# NOTE(review): 0x16 maps to U+0017 rather than U+0016 - presumably this
# mirrors the table in the specification; confirm before "fixing" it.
PDFDocEncoding = {
    0x16: u"\u0017",
    0x18: u"\u02D8",
    0x19: u"\u02C7",
    0x1A: u"\u02C6",
    0x1B: u"\u02D9",
    0x1C: u"\u02DD",
    0x1D: u"\u02DB",
    0x1E: u"\u02DA",
    0x1F: u"\u02DC",
    0x80: u"\u2022",
    0x81: u"\u2020",
    0x82: u"\u2021",
    0x83: u"\u2026",
    0x84: u"\u2014",
    0x85: u"\u2013",
    0x86: u"\u0192",
    0x87: u"\u2044",
    0x88: u"\u2039",
    0x89: u"\u203A",
    0x8A: u"\u2212",
    0x8B: u"\u2030",
    0x8C: u"\u201E",
    0x8D: u"\u201C",
    0x8E: u"\u201D",
    0x8F: u"\u2018",
    0x90: u"\u2019",
    0x91: u"\u201A",
    0x92: u"\u2122",
    0x93: u"\uFB01",
    0x94: u"\uFB02",
    0x95: u"\u0141",
    0x96: u"\u0152",
    0x97: u"\u0160",
    0x98: u"\u0178",
    0x99: u"\u017D",
    0x9A: u"\u0131",
    0x9B: u"\u0142",
    0x9C: u"\u0153",
    0x9D: u"\u0161",
    0x9E: u"\u017E",
    0xA0: u"\u20AC",
}
  67. def decode_text(b):
  68. if b[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
  69. return b[len(codecs.BOM_UTF16_BE):].decode("utf_16_be")
  70. elif py3: # Python 3.x
  71. return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
  72. else: # Python 2.x
  73. return u"".join(PDFDocEncoding.get(ord(byte), byte) for byte in b)
  74. class PdfFormatError(RuntimeError):
  75. """An error that probably indicates a syntactic or semantic error in the
  76. PDF file structure"""
  77. pass
  78. def check_format_condition(condition, error_message):
  79. if not condition:
  80. raise PdfFormatError(error_message)
  81. class IndirectReference(collections.namedtuple("IndirectReferenceTuple",
  82. ["object_id", "generation"])):
  83. def __str__(self):
  84. return "%s %s R" % self
  85. def __bytes__(self):
  86. return self.__str__().encode("us-ascii")
  87. def __eq__(self, other):
  88. return other.__class__ is self.__class__ and \
  89. other.object_id == self.object_id and \
  90. other.generation == self.generation
  91. def __ne__(self, other):
  92. return not (self == other)
  93. def __hash__(self):
  94. return hash((self.object_id, self.generation))
  95. class IndirectObjectDef(IndirectReference):
  96. def __str__(self):
  97. return "%s %s obj" % self
  98. class XrefTable:
  99. def __init__(self):
  100. self.existing_entries = {} # object ID => (offset, generation)
  101. self.new_entries = {} # object ID => (offset, generation)
  102. self.deleted_entries = {0: 65536} # object ID => generation
  103. self.reading_finished = False
  104. def __setitem__(self, key, value):
  105. if self.reading_finished:
  106. self.new_entries[key] = value
  107. else:
  108. self.existing_entries[key] = value
  109. if key in self.deleted_entries:
  110. del self.deleted_entries[key]
  111. def __getitem__(self, key):
  112. try:
  113. return self.new_entries[key]
  114. except KeyError:
  115. return self.existing_entries[key]
  116. def __delitem__(self, key):
  117. if key in self.new_entries:
  118. generation = self.new_entries[key][1] + 1
  119. del self.new_entries[key]
  120. self.deleted_entries[key] = generation
  121. elif key in self.existing_entries:
  122. generation = self.existing_entries[key][1] + 1
  123. self.deleted_entries[key] = generation
  124. elif key in self.deleted_entries:
  125. generation = self.deleted_entries[key]
  126. else:
  127. raise IndexError("object ID " + str(key) +
  128. " cannot be deleted because it doesn't exist")
  129. def __contains__(self, key):
  130. return key in self.existing_entries or key in self.new_entries
  131. def __len__(self):
  132. return len(set(self.existing_entries.keys()) |
  133. set(self.new_entries.keys()) |
  134. set(self.deleted_entries.keys()))
  135. def keys(self):
  136. return (
  137. set(self.existing_entries.keys()) -
  138. set(self.deleted_entries.keys())
  139. ) | set(self.new_entries.keys())
  140. def write(self, f):
  141. keys = sorted(set(self.new_entries.keys()) |
  142. set(self.deleted_entries.keys()))
  143. deleted_keys = sorted(set(self.deleted_entries.keys()))
  144. startxref = f.tell()
  145. f.write(b"xref\n")
  146. while keys:
  147. # find a contiguous sequence of object IDs
  148. prev = None
  149. for index, key in enumerate(keys):
  150. if prev is None or prev+1 == key:
  151. prev = key
  152. else:
  153. contiguous_keys = keys[:index]
  154. keys = keys[index:]
  155. break
  156. else:
  157. contiguous_keys = keys
  158. keys = None
  159. f.write(make_bytes("%d %d\n" %
  160. (contiguous_keys[0], len(contiguous_keys))))
  161. for object_id in contiguous_keys:
  162. if object_id in self.new_entries:
  163. f.write(make_bytes("%010d %05d n \n" %
  164. self.new_entries[object_id]))
  165. else:
  166. this_deleted_object_id = deleted_keys.pop(0)
  167. check_format_condition(object_id == this_deleted_object_id,
  168. "expected the next deleted object "
  169. "ID to be %s, instead found %s" %
  170. (object_id, this_deleted_object_id))
  171. try:
  172. next_in_linked_list = deleted_keys[0]
  173. except IndexError:
  174. next_in_linked_list = 0
  175. f.write(make_bytes("%010d %05d f \n" %
  176. (next_in_linked_list,
  177. self.deleted_entries[object_id])))
  178. return startxref
  179. class PdfName:
  180. def __init__(self, name):
  181. if isinstance(name, PdfName):
  182. self.name = name.name
  183. elif isinstance(name, bytes):
  184. self.name = name
  185. else:
  186. self.name = name.encode("us-ascii")
  187. def name_as_str(self):
  188. return self.name.decode("us-ascii")
  189. def __eq__(self, other):
  190. return (isinstance(other, PdfName) and other.name == self.name) or \
  191. other == self.name
  192. def __hash__(self):
  193. return hash(self.name)
  194. def __repr__(self):
  195. return "PdfName(%s)" % repr(self.name)
  196. @classmethod
  197. def from_pdf_stream(cls, data):
  198. return cls(PdfParser.interpret_name(data))
  199. allowed_chars = set(range(33, 127)) - set(ord(c) for c in "#%/()<>[]{}")
  200. def __bytes__(self):
  201. result = bytearray(b"/")
  202. for b in self.name:
  203. if py3: # Python 3.x
  204. if b in self.allowed_chars:
  205. result.append(b)
  206. else:
  207. result.extend(make_bytes("#%02X" % b))
  208. else: # Python 2.x
  209. if ord(b) in self.allowed_chars:
  210. result.append(b)
  211. else:
  212. result.extend(b"#%02X" % ord(b))
  213. return bytes(result)
  214. __str__ = __bytes__
  215. class PdfArray(list):
  216. def __bytes__(self):
  217. return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"
  218. __str__ = __bytes__
  219. class PdfDict(UserDict):
  220. def __setattr__(self, key, value):
  221. if key == "data":
  222. if hasattr(UserDict, "__setattr__"):
  223. UserDict.__setattr__(self, key, value)
  224. else:
  225. self.__dict__[key] = value
  226. else:
  227. if isinstance(key, str):
  228. key = key.encode("us-ascii")
  229. self[key] = value
  230. def __getattr__(self, key):
  231. try:
  232. value = self[key]
  233. except KeyError:
  234. try:
  235. value = self[key.encode("us-ascii")]
  236. except KeyError:
  237. raise AttributeError(key)
  238. if isinstance(value, bytes):
  239. value = decode_text(value)
  240. if key.endswith("Date"):
  241. if value.startswith("D:"):
  242. value = value[2:]
  243. relationship = 'Z'
  244. if len(value) > 17:
  245. relationship = value[14]
  246. offset = int(value[15:17]) * 60
  247. if len(value) > 20:
  248. offset += int(value[18:20])
  249. format = '%Y%m%d%H%M%S'[:len(value) - 2]
  250. value = time.strptime(value[:len(format)+2], format)
  251. if relationship in ['+', '-']:
  252. offset *= 60
  253. if relationship == '+':
  254. offset *= -1
  255. value = time.gmtime(calendar.timegm(value) + offset)
  256. return value
  257. def __bytes__(self):
  258. out = bytearray(b"<<")
  259. for key, value in self.items():
  260. if value is None:
  261. continue
  262. value = pdf_repr(value)
  263. out.extend(b"\n")
  264. out.extend(bytes(PdfName(key)))
  265. out.extend(b" ")
  266. out.extend(value)
  267. out.extend(b"\n>>")
  268. return bytes(out)
  269. if not py3:
  270. __str__ = __bytes__
  271. class PdfBinary:
  272. def __init__(self, data):
  273. self.data = data
  274. if py3: # Python 3.x
  275. def __bytes__(self):
  276. return make_bytes("<%s>" % "".join("%02X" % b for b in self.data))
  277. else: # Python 2.x
  278. def __str__(self):
  279. return "<%s>" % "".join("%02X" % ord(b) for b in self.data)
  280. class PdfStream:
  281. def __init__(self, dictionary, buf):
  282. self.dictionary = dictionary
  283. self.buf = buf
  284. def decode(self):
  285. try:
  286. filter = self.dictionary.Filter
  287. except AttributeError:
  288. return self.buf
  289. if filter == b"FlateDecode":
  290. try:
  291. expected_length = self.dictionary.DL
  292. except AttributeError:
  293. expected_length = self.dictionary.Length
  294. return zlib.decompress(self.buf, bufsize=int(expected_length))
  295. else:
  296. raise NotImplementedError(
  297. "stream filter %s unknown/unsupported" %
  298. repr(self.dictionary.Filter))
  299. def pdf_repr(x):
  300. if x is True:
  301. return b"true"
  302. elif x is False:
  303. return b"false"
  304. elif x is None:
  305. return b"null"
  306. elif (isinstance(x, PdfName) or isinstance(x, PdfDict) or
  307. isinstance(x, PdfArray) or isinstance(x, PdfBinary)):
  308. return bytes(x)
  309. elif isinstance(x, int):
  310. return str(x).encode("us-ascii")
  311. elif isinstance(x, time.struct_time):
  312. return b'(D:'+time.strftime('%Y%m%d%H%M%SZ', x).encode("us-ascii")+b')'
  313. elif isinstance(x, dict):
  314. return bytes(PdfDict(x))
  315. elif isinstance(x, list):
  316. return bytes(PdfArray(x))
  317. elif ((py3 and isinstance(x, str)) or
  318. (not py3 and isinstance(x, unicode))):
  319. return pdf_repr(encode_text(x))
  320. elif isinstance(x, bytes):
  321. # XXX escape more chars? handle binary garbage
  322. x = x.replace(b"\\", b"\\\\")
  323. x = x.replace(b"(", b"\\(")
  324. x = x.replace(b")", b"\\)")
  325. return b"(" + x + b")"
  326. else:
  327. return bytes(x)
  328. class PdfParser:
  329. """Based on https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
  330. Supports PDF up to 1.4
  331. """
    def __init__(self, filename=None, f=None,
                 buf=None, start_offset=0, mode="rb"):
        # type: (PdfParser, str, file, Union[bytes, bytearray], int, str)
        # -> None
        """Open a PDF from a file name, a file object or a buffer.

        ``buf`` and ``f`` are mutually exclusive (RuntimeError).  When only
        ``filename`` is given the file is opened with ``mode`` and closed
        again by close().  A non-empty buffer is parsed immediately;
        otherwise the parser starts with empty document state.
        """
        if buf and f:
            raise RuntimeError(
                "specify buf or f or filename, but not both buf and f")
        self.filename = filename
        self.buf = buf
        self.f = f
        self.start_offset = start_offset
        self.should_close_buf = False
        self.should_close_file = False
        if filename is not None and f is None:
            # we opened the file ourselves, so close() must close it
            self.f = f = open(filename, mode)
            self.should_close_file = True
        if f is not None:
            self.buf = buf = self.get_buf_from_file(f)
            self.should_close_buf = True
            if not filename and hasattr(f, "name"):
                self.filename = f.name
        self.cached_objects = {}
        if buf:
            self.read_pdf_info()
        else:
            # nothing to parse: start from an empty document
            self.file_size_total = self.file_size_this = 0
            self.root = PdfDict()
            self.root_ref = None
            self.info = PdfDict()
            self.info_ref = None
            self.page_tree_root = {}
            self.pages = []
            self.orig_pages = []
            self.pages_ref = None
            self.last_xref_section_offset = None
            self.trailer_dict = {}
            self.xref_table = XrefTable()
        # from now on, stored xref entries count as new entries
        self.xref_table.reading_finished = True
        if f:
            self.seek_end()
    def __enter__(self):
        # context-manager entry: the parser itself is the managed object
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        # context-manager exit: release the buffer/file via close()
        self.close()
        return False  # do not suppress exceptions
    def start_writing(self):
        # drop the read buffer, then position the file at its end
        self.close_buf()
        self.seek_end()
    def close_buf(self):
        # Close the buffer if it has a close() method, then forget it.
        try:
            self.buf.close()
        except AttributeError:
            # buffer is None or an object without close()
            pass
        self.buf = None
  386. def close(self):
  387. if self.should_close_buf:
  388. self.close_buf()
  389. if self.f is not None and self.should_close_file:
  390. self.f.close()
  391. self.f = None
    def seek_end(self):
        # move the file position to the end of the file
        self.f.seek(0, os.SEEK_END)
    def write_header(self):
        # write the PDF version header line at the current file position
        self.f.write(b"%PDF-1.4\n")
  396. def write_comment(self, s):
  397. self.f.write(("%% %s\n" % (s,)).encode("utf-8"))
    def write_catalog(self):
        # Replace the document catalog and the page tree root with new
        # objects and return the reference of the new catalog.
        self.del_root()
        # the catalog is written right here, so its offset is known
        self.root_ref = self.next_object_id(self.f.tell())
        # reserve an ID for the Pages object; its real offset is recorded by
        # write_obj() below
        self.pages_ref = self.next_object_id(0)
        self.rewrite_pages()
        self.write_obj(self.root_ref,
                       Type=PdfName(b"Catalog"),
                       Pages=self.pages_ref)
        self.write_obj(self.pages_ref,
                       Type=PdfName(b"Pages"),
                       Count=len(self.pages),
                       Kids=self.pages)
        return self.root_ref
    def rewrite_pages(self):
        # Write every original page that is still listed in self.pages again
        # as a new object whose parent is self.pages_ref, and delete the xref
        # entries of the original pages and of their Pages tree ancestors.
        pages_tree_nodes_to_delete = []
        for i, page_ref in enumerate(self.orig_pages):
            page_info = self.cached_objects[page_ref]
            del self.xref_table[page_ref.object_id]
            pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
            if page_ref not in self.pages:
                # the page has been deleted
                continue
            # make dict keys into strings for passing to write_page
            stringified_page_info = {}
            for key, value in page_info.items():
                # key should be a PdfName
                stringified_page_info[key.name_as_str()] = value
            stringified_page_info["Parent"] = self.pages_ref
            new_page_ref = self.write_page(None, **stringified_page_info)
            for j, cur_page_ref in enumerate(self.pages):
                if cur_page_ref == page_ref:
                    # replace the page reference with the new one
                    self.pages[j] = new_page_ref
        # delete redundant Pages tree nodes from xref table
        for pages_tree_node_ref in pages_tree_nodes_to_delete:
            # walk up the Parent chain until a node has no Parent
            while pages_tree_node_ref:
                pages_tree_node = self.cached_objects[pages_tree_node_ref]
                if pages_tree_node_ref.object_id in self.xref_table:
                    del self.xref_table[pages_tree_node_ref.object_id]
                pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
        self.orig_pages = []
    def write_xref_and_trailer(self, new_root_ref=None):
        # Write the Info object (if non-empty), the xref section and the
        # trailer.  When new_root_ref is given it replaces the current root.
        if new_root_ref:
            self.del_root()
            self.root_ref = new_root_ref
        if self.info:
            # must happen before the xref table is written so that the new
            # object gets an entry in it
            self.info_ref = self.write_obj(None, self.info)
        start_xref = self.xref_table.write(self.f)
        num_entries = len(self.xref_table)
        trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
        if self.last_xref_section_offset is not None:
            # link to the previous xref section
            trailer_dict[b"Prev"] = self.last_xref_section_offset
        if self.info:
            trailer_dict[b"Info"] = self.info_ref
        self.last_xref_section_offset = start_xref
        self.f.write(b"trailer\n" + bytes(PdfDict(trailer_dict)) +
                     make_bytes("\nstartxref\n%d\n%%%%EOF" % start_xref))
  455. def write_page(self, ref, *objs, **dict_obj):
  456. if isinstance(ref, int):
  457. ref = self.pages[ref]
  458. if "Type" not in dict_obj:
  459. dict_obj["Type"] = PdfName(b"Page")
  460. if "Parent" not in dict_obj:
  461. dict_obj["Parent"] = self.pages_ref
  462. return self.write_obj(ref, *objs, **dict_obj)
    def write_obj(self, ref, *objs, **dict_obj):
        # Write one indirect object at the current file position and return
        # its reference.  With ref=None a new object ID is allocated.  The
        # keyword "stream" is not written as a dictionary entry: its value is
        # written as the stream body and its length stored under "Length".
        f = self.f
        if ref is None:
            ref = self.next_object_id(f.tell())
        else:
            # record where this object now lives
            self.xref_table[ref.object_id] = (f.tell(), ref.generation)
        f.write(bytes(IndirectObjectDef(*ref)))
        stream = dict_obj.pop("stream", None)
        if stream is not None:
            dict_obj["Length"] = len(stream)
        if dict_obj:
            f.write(pdf_repr(dict_obj))
        for obj in objs:
            f.write(pdf_repr(obj))
        if stream is not None:
            f.write(b"stream\n")
            f.write(stream)
            f.write(b"\nendstream\n")
        f.write(b"endobj\n")
        return ref
  483. def del_root(self):
  484. if self.root_ref is None:
  485. return
  486. del self.xref_table[self.root_ref.object_id]
  487. del self.xref_table[self.root[b"Pages"].object_id]
  488. @staticmethod
  489. def get_buf_from_file(f):
  490. if hasattr(f, "getbuffer"):
  491. return f.getbuffer()
  492. elif hasattr(f, "getvalue"):
  493. return f.getvalue()
  494. else:
  495. try:
  496. return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
  497. except ValueError: # cannot mmap an empty file
  498. return b""
    def read_pdf_info(self):
        # Parse the trailer, the root (catalog), the optional Info dictionary
        # and the page tree of the document in self.buf.  Raises
        # PdfFormatError when the root lacks a valid /Type or /Pages.
        self.file_size_total = len(self.buf)
        self.file_size_this = self.file_size_total - self.start_offset
        self.read_trailer()
        self.root_ref = self.trailer_dict[b"Root"]
        self.info_ref = self.trailer_dict.get(b"Info", None)
        self.root = PdfDict(self.read_indirect(self.root_ref))
        if self.info_ref is None:
            self.info = PdfDict()
        else:
            self.info = PdfDict(self.read_indirect(self.info_ref))
        check_format_condition(b"Type" in self.root, "/Type missing in Root")
        check_format_condition(self.root[b"Type"] == b"Catalog",
                               "/Type in Root is not /Catalog")
        check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
        check_format_condition(isinstance(self.root[b"Pages"],
                                          IndirectReference),
                               "/Pages in Root is not an indirect reference")
        self.pages_ref = self.root[b"Pages"]
        self.page_tree_root = self.read_indirect(self.pages_ref)
        self.pages = self.linearize_page_tree(self.page_tree_root)
        # save the original list of page references
        # in case the user modifies, adds or deletes some pages
        # and we need to rewrite the pages and their list
        self.orig_pages = self.pages[:]
  524. def next_object_id(self, offset=None):
  525. try:
  526. # TODO: support reuse of deleted objects
  527. reference = IndirectReference(max(self.xref_table.keys()) + 1, 0)
  528. except ValueError:
  529. reference = IndirectReference(1, 0)
  530. if offset is not None:
  531. self.xref_table[reference.object_id] = (offset, 0)
  532. return reference
    # Building blocks (byte regex fragments) shared by the patterns below.
    delimiter = br"[][()<>{}/%]"
    delimiter_or_ws = br"[][()<>{}/%\000\011\012\014\015\040]"
    whitespace = br"[\000\011\012\014\015\040]"
    whitespace_or_hex = br"[\000\011\012\014\015\0400-9a-fA-F]"
    whitespace_optional = whitespace + b"*"
    whitespace_mandatory = whitespace + b"+"
    newline_only = br"[\r\n]+"
    newline = whitespace_optional + newline_only + whitespace_optional
    # Trailer at the very end of the data ("$").  Group 1 is the trailer
    # dictionary body including the closing ">>" (greedy), group 2 the
    # startxref offset.
    re_trailer_end = re.compile(
        whitespace_mandatory + br"trailer" + whitespace_optional +
        br"\<\<(.*\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" +
        newline + br"%%EOF" + whitespace_optional + br"$", re.DOTALL)
    # Trailer of an earlier xref section: same groups, but the dictionary
    # body is matched non-greedily and the match is not anchored at the end.
    re_trailer_prev = re.compile(
        whitespace_optional + br"trailer" + whitespace_optional +
        br"\<\<(.*?\>\>)" + newline + br"startxref" + newline + br"([0-9]+)" +
        newline + br"%%EOF" + whitespace_optional, re.DOTALL)
    def read_trailer(self):
        # Locate and parse the last trailer, then read its xref section and
        # follow the /Prev chain.  Only the final 16384 bytes of the buffer
        # (never before start_offset) are searched.
        search_start_offset = len(self.buf) - 16384
        if search_start_offset < self.start_offset:
            search_start_offset = self.start_offset
        m = self.re_trailer_end.search(self.buf, search_start_offset)
        check_format_condition(m, "trailer end not found")
        # make sure we found the LAST trailer
        last_match = m
        while m:
            last_match = m
            # keep searching shortly after the start of the current match
            m = self.re_trailer_end.search(self.buf, m.start()+16)
        if not m:
            m = last_match
        trailer_data = m.group(1)
        self.last_xref_section_offset = int(m.group(2))
        self.trailer_dict = self.interpret_trailer(trailer_data)
        self.xref_table = XrefTable()
        self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
        if b"Prev" in self.trailer_dict:
            self.read_prev_trailer(self.trailer_dict[b"Prev"])
  569. def read_prev_trailer(self, xref_section_offset):
  570. trailer_offset = self.read_xref_table(
  571. xref_section_offset=xref_section_offset)
  572. m = self.re_trailer_prev.search(
  573. self.buf[trailer_offset:trailer_offset+16384])
  574. check_format_condition(m, "previous trailer not found")
  575. trailer_data = m.group(1)
  576. check_format_condition(int(m.group(2)) == xref_section_offset,
  577. "xref section offset in previous trailer "
  578. "doesn't match what was expected")
  579. trailer_dict = self.interpret_trailer(trailer_data)
  580. if b"Prev" in trailer_dict:
  581. self.read_prev_trailer(trailer_dict[b"Prev"])
    re_whitespace_optional = re.compile(whitespace_optional)
    # "/" followed by name characters (group 1, without the slash); a
    # delimiter or whitespace must follow (lookahead, not consumed)
    re_name = re.compile(
        whitespace_optional + br"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?=" +
        delimiter_or_ws + br")")
    # opening "<<" and closing ">>" of a dictionary
    re_dict_start = re.compile(whitespace_optional + br"\<\<")
    re_dict_end = re.compile(
        whitespace_optional + br"\>\>" + whitespace_optional)
    @classmethod
    def interpret_trailer(cls, trailer_data):
        # Parse the body of a trailer dictionary (name/value pairs up to and
        # including the closing ">>") into a dict keyed by name bytes.
        # Raises PdfFormatError on unparsable data or when /Size (int) or
        # /Root (indirect reference) is missing.
        trailer = {}
        offset = 0
        while True:
            m = cls.re_name.match(trailer_data, offset)
            if not m:
                # no further name: only the closing ">>" may remain
                m = cls.re_dict_end.match(trailer_data, offset)
                check_format_condition(
                    m and m.end() == len(trailer_data),
                    "name not found in trailer, remaining data: " +
                    repr(trailer_data[offset:]))
                break
            key = cls.interpret_name(m.group(1))
            value, offset = cls.get_value(trailer_data, m.end())
            trailer[key] = value
        check_format_condition(
            b"Size" in trailer and isinstance(trailer[b"Size"], int),
            "/Size not in trailer or not an integer")
        check_format_condition(
            b"Root" in trailer and
            isinstance(trailer[b"Root"], IndirectReference),
            "/Root not in trailer or not an indirect reference")
        return trailer
    # a run of non-"#" bytes (group 1), optionally followed by a "#XX" escape
    # whose two hex digits are group 3
    re_hashes_in_name = re.compile(br"([^#]*)(#([0-9a-fA-F]{2}))?")
  614. @classmethod
  615. def interpret_name(cls, raw, as_text=False):
  616. name = b""
  617. for m in cls.re_hashes_in_name.finditer(raw):
  618. if m.group(3):
  619. name += m.group(1) + \
  620. bytearray.fromhex(m.group(3).decode("us-ascii"))
  621. else:
  622. name += m.group(1)
  623. if as_text:
  624. return name.decode("utf-8")
  625. else:
  626. return bytes(name)
    # Token patterns used by get_value().  Keywords and numbers must be
    # followed by a delimiter or whitespace (lookahead, not consumed).
    re_null = re.compile(
        whitespace_optional + br"null(?=" + delimiter_or_ws + br")")
    re_true = re.compile(
        whitespace_optional + br"true(?=" + delimiter_or_ws + br")")
    re_false = re.compile(
        whitespace_optional + br"false(?=" + delimiter_or_ws + br")")
    re_int = re.compile(
        whitespace_optional + br"([-+]?[0-9]+)(?=" + delimiter_or_ws + br")")
    # a real needs a decimal point with digits on at least one side
    re_real = re.compile(
        whitespace_optional + br"([-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+))(?=" +
        delimiter_or_ws + br")")
    re_array_start = re.compile(whitespace_optional + br"\[")
    re_array_end = re.compile(whitespace_optional + br"]")
    # hexadecimal string; group 1 is the content, possibly with whitespace
    re_string_hex = re.compile(
        whitespace_optional + br"\<(" + whitespace_or_hex + br"*)\>")
    # only the opening "(" - the rest is read by get_literal_string()
    re_string_lit = re.compile(whitespace_optional + br"\(")
    # "<id> <generation> R"
    re_indirect_reference = re.compile(
        whitespace_optional + br"([-+]?[0-9]+)" + whitespace_mandatory +
        br"([-+]?[0-9]+)" + whitespace_mandatory + br"R(?=" + delimiter_or_ws +
        br")")
    # "<id> <generation> obj"
    re_indirect_def_start = re.compile(
        whitespace_optional + br"([-+]?[0-9]+)" + whitespace_mandatory +
        br"([-+]?[0-9]+)" + whitespace_mandatory + br"obj(?=" +
        delimiter_or_ws + br")")
    re_indirect_def_end = re.compile(
        whitespace_optional + br"endobj(?=" + delimiter_or_ws + br")")
    # zero or more "%" comment lines
    re_comment = re.compile(
        br"(" + whitespace_optional + br"%[^\r\n]*" + newline + br")*")
    re_stream_start = re.compile(whitespace_optional + br"stream\r?\n")
    re_stream_end = re.compile(
        whitespace_optional + br"endstream(?=" + delimiter_or_ws + br")")
    @classmethod
    def get_value(cls, data, offset, expect_indirect=None, max_nesting=-1):
        # Parse one PDF object from data starting at offset and return
        # (value, offset after the object).
        #
        # expect_indirect: when set, the data must be an indirect object
        #     definition with exactly this reference.
        # max_nesting: decremented on every nested call; at 0 parsing stops
        #     and (None, None) is returned.  A None offset then propagates
        #     outwards together with the partial result.  The default of -1
        #     never reaches 0 by decrementing.
        #
        # Raises PdfFormatError on malformed or unrecognized data.
        if max_nesting == 0:
            return None, None
        # skip leading comment lines
        m = cls.re_comment.match(data, offset)
        if m:
            offset = m.end()
        # indirect object definition: "<id> <gen> obj ... endobj"
        m = cls.re_indirect_def_start.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object definition: object ID must be greater than 0")
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object definition: generation must be non-negative")
            check_format_condition(
                expect_indirect is None or expect_indirect ==
                IndirectReference(int(m.group(1)), int(m.group(2))),
                "indirect object definition different than expected")
            object, offset = cls.get_value(
                data, m.end(), max_nesting=max_nesting-1)
            if offset is None:
                return object, None
            m = cls.re_indirect_def_end.match(data, offset)
            check_format_condition(
                m, "indirect object definition end not found")
            return object, m.end()
        check_format_condition(
            not expect_indirect, "indirect object definition not found")
        # indirect reference: "<id> <gen> R"
        m = cls.re_indirect_reference.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object reference: object ID must be greater than 0")
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object reference: generation must be non-negative")
            return IndirectReference(int(m.group(1)), int(m.group(2))), m.end()
        # dictionary, possibly followed by a stream
        m = cls.re_dict_start.match(data, offset)
        if m:
            offset = m.end()
            result = {}
            m = cls.re_dict_end.match(data, offset)
            while not m:
                key, offset = cls.get_value(
                    data, offset, max_nesting=max_nesting-1)
                if offset is None:
                    return result, None
                value, offset = cls.get_value(
                    data, offset, max_nesting=max_nesting-1)
                result[key] = value
                if offset is None:
                    return result, None
                m = cls.re_dict_end.match(data, offset)
            offset = m.end()
            m = cls.re_stream_start.match(data, offset)
            if m:
                try:
                    stream_len = int(result[b"Length"])
                except (TypeError, KeyError, ValueError):
                    raise PdfFormatError(
                        "bad or missing Length in stream dict (%r)" %
                        result.get(b"Length", None))
                # the stream body is exactly Length bytes long
                stream_data = data[m.end():m.end() + stream_len]
                m = cls.re_stream_end.match(data, m.end() + stream_len)
                check_format_condition(m, "stream end not found")
                offset = m.end()
                result = PdfStream(PdfDict(result), stream_data)
            else:
                result = PdfDict(result)
            return result, offset
        # array
        m = cls.re_array_start.match(data, offset)
        if m:
            offset = m.end()
            result = []
            m = cls.re_array_end.match(data, offset)
            while not m:
                value, offset = cls.get_value(
                    data, offset, max_nesting=max_nesting-1)
                result.append(value)
                if offset is None:
                    return result, None
                m = cls.re_array_end.match(data, offset)
            return result, m.end()
        # simple objects
        m = cls.re_null.match(data, offset)
        if m:
            return None, m.end()
        m = cls.re_true.match(data, offset)
        if m:
            return True, m.end()
        m = cls.re_false.match(data, offset)
        if m:
            return False, m.end()
        m = cls.re_name.match(data, offset)
        if m:
            return PdfName(cls.interpret_name(m.group(1))), m.end()
        m = cls.re_int.match(data, offset)
        if m:
            return int(m.group(1)), m.end()
        m = cls.re_real.match(data, offset)
        if m:
            # XXX Decimal instead of float???
            return float(m.group(1)), m.end()
        m = cls.re_string_hex.match(data, offset)
        if m:
            # filter out whitespace
            hex_string = bytearray([
                b for b in m.group(1)
                if b in b"0123456789abcdefABCDEF"
            ])
            if len(hex_string) % 2 == 1:
                # append a 0 if the length is not even - yes, at the end
                hex_string.append(ord(b"0"))
            return bytearray.fromhex(hex_string.decode("us-ascii")), m.end()
        m = cls.re_string_lit.match(data, offset)
        if m:
            return cls.get_literal_string(data, m.end())
        # return None, offset  # fallback (only for debugging)
        raise PdfFormatError(
            "unrecognized object: " + repr(data[offset:offset+32]))
    # Tokens with a special meaning inside a literal string.  Groups:
    # 1 = backslash + escape character, 2 = backslash + 1-3 digits,
    # 3 = backslash + end of line, 5 = bare end of line, 6 = "(", 7 = ")"
    # (group 4 is the end-of-line part of group 3).
    re_lit_str_token = re.compile(br"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))")
    # Escape character => byte(s) it stands for.  Listed twice because
    # indexing bytes gives a one-character string on Python 2 but an integer
    # on Python 3.
    escaped_chars = {
        b"n": b"\n",
        b"r": b"\r",
        b"t": b"\t",
        b"b": b"\b",
        b"f": b"\f",
        b"(": b"(",
        b")": b")",
        b"\\": b"\\",
        ord(b"n"): b"\n",
        ord(b"r"): b"\r",
        ord(b"t"): b"\t",
        ord(b"b"): b"\b",
        ord(b"f"): b"\f",
        ord(b"("): b"(",
        ord(b")"): b")",
        ord(b"\\"): b"\\",
    }
  797. @classmethod
  798. def get_literal_string(cls, data, offset):
  799. nesting_depth = 0
  800. result = bytearray()
  801. for m in cls.re_lit_str_token.finditer(data, offset):
  802. result.extend(data[offset:m.start()])
  803. if m.group(1):
  804. result.extend(cls.escaped_chars[m.group(1)[1]])
  805. elif m.group(2):
  806. result.append(int(m.group(2)[1:], 8))
  807. elif m.group(3):
  808. pass
  809. elif m.group(5):
  810. result.extend(b"\n")
  811. elif m.group(6):
  812. result.extend(b"(")
  813. nesting_depth += 1
  814. elif m.group(7):
  815. if nesting_depth == 0:
  816. return bytes(result), m.end()
  817. result.extend(b")")
  818. nesting_depth -= 1
  819. offset = m.end()
  820. raise PdfFormatError("unfinished literal string")
    # "xref" keyword line that opens a cross-reference section
    re_xref_section_start = re.compile(
        whitespace_optional + br"xref" + newline)
    # subsection header: first object number (group 1), entry count (group 2)
    re_xref_subsection_start = re.compile(
        whitespace_optional + br"([0-9]+)" + whitespace_mandatory +
        br"([0-9]+)" + whitespace_optional + newline_only)
    # one fixed-width entry: 10-digit number (group 1), 5-digit generation
    # (group 2), "f"/"n" flag (group 3) and a two-byte end of line
    re_xref_entry = re.compile(br"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")
    def read_xref_table(self, xref_section_offset):
        # Read the xref section at xref_section_offset (relative to
        # start_offset) into self.xref_table and return the buffer offset
        # just past its last entry.  Free ("f") entries are skipped.  Raises
        # PdfFormatError on malformed sections or when an object ID is listed
        # again with different values.
        subsection_found = False
        m = self.re_xref_section_start.match(
            self.buf, xref_section_offset + self.start_offset)
        check_format_condition(m, "xref section start not found")
        offset = m.end()
        while True:
            m = self.re_xref_subsection_start.match(self.buf, offset)
            if not m:
                # no further subsection header; at least one must have been
                # seen
                check_format_condition(
                    subsection_found, "xref subsection start not found")
                break
            subsection_found = True
            offset = m.end()
            first_object = int(m.group(1))
            num_objects = int(m.group(2))
            for i in range(first_object, first_object+num_objects):
                m = self.re_xref_entry.match(self.buf, offset)
                check_format_condition(m, "xref entry not found")
                offset = m.end()
                is_free = m.group(3) == b"f"
                generation = int(m.group(2))
                if not is_free:
                    new_entry = (int(m.group(1)), generation)
                    check_format_condition(
                        i not in self.xref_table or
                        self.xref_table[i] == new_entry,
                        "xref entry duplicated (and not identical)")
                    self.xref_table[i] = new_entry
        return offset
    def read_indirect(self, ref, max_nesting=-1):
        # Parse the indirect object that ref (object ID, generation) points
        # to, store it in self.cached_objects and return it.  Raises
        # PdfFormatError when the generation in the xref table differs, and
        # KeyError when the object ID is not in the table.
        offset, generation = self.xref_table[ref[0]]
        check_format_condition(
            generation == ref[1],
            "expected to find generation %s for object ID %s in xref table, "
            "instead found generation %s at offset %s"
            % (ref[1], ref[0], generation, offset))
        # xref offsets are relative to start_offset
        value = self.get_value(self.buf, offset + self.start_offset,
                               expect_indirect=IndirectReference(*ref),
                               max_nesting=max_nesting)[0]
        self.cached_objects[ref] = value
        return value
  869. def linearize_page_tree(self, node=None):
  870. if node is None:
  871. node = self.page_tree_root
  872. check_format_condition(
  873. node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages")
  874. pages = []
  875. for kid in node[b"Kids"]:
  876. kid_object = self.read_indirect(kid)
  877. if kid_object[b"Type"] == b"Page":
  878. pages.append(kid)
  879. else:
  880. pages.extend(self.linearize_page_tree(node=kid_object))
  881. return pages