#!/usr/bin/env python3
"""Fetch a canteen menu page, strip unwanted elements, and save it as HTML.

The script downloads the page at ``MENU_URL``, removes the ``<form>``, all
``<sup>`` elements and all ``<img>`` elements from the second ``<body>``
element found in the document, and writes the inner HTML of that body's
first ``<div>`` to ``speiseplan.html``.
"""
import re
from string import Template
# ``import urllib`` alone does not load the ``request`` submodule, so
# ``urllib.request.urlopen`` could fail with AttributeError; import it
# explicitly.
import urllib.request

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

MENU_URL = 'https://www.werkswelt.de/?id=mohm'
OUTPUT_FILE = "speiseplan.html"


def excludeAllergens(html_soup):
    """Remove every ``<sup>`` element from *html_soup* in place.

    NOTE(review): the function name suggests the ``<sup>`` tags hold
    allergen markers -- confirm against the page markup.
    """
    for element in html_soup.find_all("sup"):
        element.extract()


def excludeForm(html_soup):
    """Remove the first ``<form>`` element from *html_soup* in place.

    Does nothing when the tree contains no ``<form>``; ``find`` returns
    ``None`` in that case and calling ``extract`` on it would raise.
    """
    form = html_soup.find('form')
    if form is not None:
        form.extract()


def excludeImages(html_soup):
    """Remove every ``<img>`` element from *html_soup* in place."""
    for element in html_soup.find_all("img"):
        element.extract()


def main():
    """Download the menu page, clean it, and write ``speiseplan.html``."""
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(MENU_URL) as response:
        page = response.read().decode()

    # Rewrite every '/br' to 'br' (e.g. '</br>' becomes '<br>') before parsing.
    correctedPage = re.sub('/br', 'br', page)
    html_soup = BeautifulSoup(correctedPage, features='html.parser')

    # The second <body> element found (index 1) is the one processed.
    # NOTE(review): presumably the page nests a second <body> inside the
    # outer one (an earlier variant used html_soup.body.html.body) -- an
    # IndexError here means the page layout changed.
    parsed_html = html_soup.find_all('body')
    menu_body = parsed_html[1]

    excludeForm(menu_body)
    excludeAllergens(menu_body)
    excludeImages(menu_body)

    # NOTE(review): the template literal may have lost surrounding markup
    # when this file was extracted; it is reproduced exactly as found.
    template = Template('\n\n$parsed_html\n\n')

    # Write the HTML file. The page was decoded as UTF-8, so write it back
    # as UTF-8 instead of relying on the platform default encoding.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as file:
        file.write(
            template.substitute(parsed_html=menu_body.div.decode_contents())
        )


if __name__ == '__main__':
    main()