Development of an internal social media platform with personalised dashboards for students

tokenizer.py 2.3KB

  1. """
  2. ldap.schema.tokenizer - Low-level parsing functions for schema element strings
  3. See https://www.python-ldap.org/ for details.
  4. """
  5. import re
  6. TOKENS_FINDALL = re.compile(
  7. r"(\()" # opening parenthesis
  8. r"|" # or
  9. r"(\))" # closing parenthesis
  10. r"|" # or
  11. r"([^'$()\s]+)" # string of length >= 1 without '$() or whitespace
  12. r"|" # or
  13. r"('.*?'(?!\w))" # any string or empty string surrounded by single quotes
  14. # except if right quote is succeeded by alphanumeric char
  15. r"|" # or
  16. r"([^\s]+?)", # residue, all non-whitespace strings
  17. ).findall
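
For reference, a quick illustration (not part of the module) of the raw match
tuples TOKENS_FINDALL yields: one 5-tuple per match, one group per regex
alternative, with empty strings for the alternatives that did not match:

>>> TOKENS_FINDALL("NAME 'cn'")
[('', '', 'NAME', '', ''), ('', '', '', "'cn'", '')]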
def split_tokens(s):
    """
    Returns list of syntax elements with quotes and spaces stripped.
    """
    parts = []
    parens = 0
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # strip the surrounding single quotes
            parts.append(quoted[1:-1])
        elif opar:
            parens += 1
            parts.append(opar)
        elif cpar:
            parens -= 1
            parts.append(cpar)
        elif residue == '$':
            # '$' separators are only valid inside parentheses; valid
            # ones are simply dropped from the output
            if not parens:
                raise ValueError("'$' outside parenthesis in %r" % (s))
        else:
            # any other residue is a syntax error
            raise ValueError(residue, s)
    if parens:
        raise ValueError("Unbalanced parenthesis in %r" % (s))
    return parts
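
A quick usage sketch for split_tokens; the sample attribute type definition
is illustrative, not taken from the module:

>>> split_tokens("( 2.5.4.3 NAME ( 'cn' 'commonName' ) SUP name )")
['(', '2.5.4.3', 'NAME', '(', 'cn', 'commonName', ')', 'SUP', 'name', ')']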
def extract_tokens(l, known_tokens):
    """
    Returns a dict mapping each known token to a tuple of its values,
    scanned from the token list l as produced by split_tokens().
    """
    assert l[0].strip() == "(" and l[-1].strip() == ")", ValueError(l)
    result = {}
    result.update(known_tokens)
    i = 0
    l_len = len(l)
    while i < l_len:
        if l[i] in result:
            token = l[i]
            i += 1  # Consume token
            if i < l_len:
                if l[i] in result:
                    # non-valued token: mark as present with an empty tuple;
                    # the following known token is handled on the next pass
                    result[token] = ()
                elif l[i] == "(":
                    # multi-valued token: collect everything up to the
                    # closing parenthesis, dropping '$' separators
                    i += 1  # Consume left parenthesis
                    start = i
                    while i < l_len and l[i] != ")":
                        i += 1
                    result[token] = tuple(filter(lambda v: v != '$', l[start:i]))
                    i += 1  # Consume right parenthesis
                else:
                    # single-valued token: wrap the value in a 1-tuple
                    result[token] = l[i],
                    i += 1  # Consume single value
        else:
            i += 1  # Consume unrecognized item
    return result
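
And a minimal sketch of extract_tokens on the same sample string, assuming
the caller seeds known_tokens with per-token defaults (as the schema model
classes in python-ldap do); all extracted values come back as tuples:

>>> tokens = split_tokens("( 2.5.4.3 NAME ( 'cn' 'commonName' ) SUP name )")
>>> extract_tokens(tokens, {'NAME': (), 'SUP': ()})
{'NAME': ('cn', 'commonName'), 'SUP': ('name',)}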