123456789101112131415161718192021222324252627282930313233343536373839404142 |
- #!/usr/bin/env python3
- import re
- from string import Template
- import urllib
- try:
- from BeautifulSoup import BeautifulSoup
- except ImportError:
- from bs4 import BeautifulSoup
-
-
def excludeAllergens(html_soup):
    """Detach every ``<sup>`` element from ``html_soup`` in place.

    Going by the function name, the superscripts are presumably the
    allergen markers printed next to each dish -- confirm against the page.

    Args:
        html_soup: Parsed BeautifulSoup node (or whole document) to clean.
            It is modified in place; nothing is returned.
    """
    for sup_tag in html_soup.find_all("sup"):
        sup_tag.extract()
-
-
def excludeForm(html_soup):
    """Remove the first ``<form>`` element from ``html_soup`` in place.

    Args:
        html_soup: Parsed BeautifulSoup node (or whole document) to clean.
            It is modified in place; nothing is returned.

    If the node contains no ``<form>`` the call is a no-op.
    """
    form_tag = html_soup.find('form')
    # ``find`` returns None when nothing matches; calling ``extract`` on it
    # would raise AttributeError, so skip the removal in that case.
    if form_tag is not None:
        form_tag.extract()
-
-
def excludeImages(html_soup):
    """Detach every ``<img>`` element from ``html_soup`` in place.

    Args:
        html_soup: Parsed BeautifulSoup node (or whole document) to clean.
            It is modified in place; nothing is returned.
    """
    for img_tag in html_soup.find_all("img"):
        img_tag.extract()
-
-
def _fetch_page(url, timeout=30):
    """Download ``url`` and return the response body decoded as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The response body decoded with ``bytes.decode()``'s default (UTF-8).
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly here to make ``urllib.request`` resolvable.
    import urllib.request

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode()


def main():
    """Fetch the menu page, strip unwanted elements and write speiseplan.html.

    The content of the first ``<div>`` inside the document's second
    ``<body>`` element is written, wrapped in a small black-background
    HTML template, to ``speiseplan.html`` in the current directory.

    Raises:
        SystemExit: If the page does not contain the expected second
            ``<body>`` element or that element has no ``<div>``.
    """
    url = 'https://www.werkswelt.de/?id=mohm'
    page = _fetch_page(url)

    # Turn the invalid closing tag ``</br>`` into ``<br>``. Only the tag
    # itself is matched so that unrelated text containing "/br" (for
    # example URL paths) is left alone.
    corrected_page = re.sub(r'</br\s*>', '<br>', page)

    html_soup = BeautifulSoup(corrected_page, features='html.parser')
    bodies = html_soup.find_all('body')
    # The script relies on the second <body> element (index 1); presumably
    # the page nests one document inside another -- verify if the site changes.
    if len(bodies) < 2:
        raise SystemExit(
            'Unexpected page layout: found %d <body> element(s), need 2.'
            % len(bodies)
        )
    menu_body = bodies[1]

    excludeForm(menu_body)
    excludeAllergens(menu_body)
    excludeImages(menu_body)

    menu_div = menu_body.div
    if menu_div is None:
        raise SystemExit(
            'Unexpected page layout: second <body> contains no <div>.'
        )

    template = Template('<html>\n<div style="background:black; color:white">\n$parsed_html\n</div>\n</html>')

    # Write the HTML file; the explicit encoding keeps non-ASCII text from
    # depending on the platform's locale.
    with open("speiseplan.html", "w", encoding="utf-8") as out_file:
        out_file.write(template.substitute(parsed_html=menu_div.decode_contents()))


if __name__ == '__main__':
    main()
|