#!/usr/bin/env python
# Recovered from a git format-patch that had been collapsed onto one line:
#   Commit:  7f2c84f53d9c0ad5e18ae24e9e512e61fdfecc47
#   From:    Christopher Arndt
#   Date:    Tue, 7 May 2024 08:20:16 +0200
#   Subject: [PATCH] Add requests and html parsing example
#   Signed-off-by: Christopher Arndt
#   Path:    beispiele/wp_crawler1.py (new file)
"""Print the links found in the "See also" section of a web page.

The page is fetched with ``requests``, parsed with ``lxml.html`` and the
links of the list that follows the "See also" heading are printed, one
``title url`` pair per line.
"""

import sys
import urllib.parse as urlparse

import requests
import lxml.html


# Tag names that mark the start of another section, i.e. the point at which
# the search for the "See also" list gives up.
_HEADING_TAGS = frozenset({"h1", "h2", "h3", "h4", "h5", "h6"})

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` may block indefinitely.
_REQUEST_TIMEOUT = 10


def find_see_also_links(tree):
    """Return the links of the "See also" section of a parsed HTML page.

    Args:
        tree: Root element of a document parsed with ``lxml.html``.

    Returns:
        A list of ``(title, href)`` tuples, one per ``<a href=...>`` that is
        a direct child of an ``<li>`` in the first list following the
        "See also" heading. ``href`` is returned exactly as written in the
        page (it may be relative). Returns ``None`` when the page has no
        ``<span id="See_also">`` or no such list appears before the next
        heading.
    """
    # get marker element of the "See also" section using XPath
    markers = tree.xpath("//span[@id='See_also']")
    if not markers:
        return None

    # the span sits inside the heading element; the list is a sibling of
    # that heading, not of the span itself
    see_also_heading = markers[0].getparent()
    if see_also_heading is None:
        return None

    # find following UL element, which contains the LI elements with the
    # actual links
    for sibling in see_also_heading.itersiblings():
        # comments and processing instructions have a non-string tag and can
        # neither contain links nor end the section
        if not isinstance(sibling.tag, str):
            continue

        anchors = sibling.xpath("li/a[@href]")
        if anchors:
            # text_content() also collects text nested in child elements
            # (e.g. <a><i>Title</i></a>), where .text would be None
            return [(a.text_content(), a.get("href")) for a in anchors]

        # reached the next section without finding a list of links
        if sibling.tag in _HEADING_TAGS:
            break

    return None


def main():
    """Fetch the URL given on the command line and print its "See also" links.

    Exits with an error message when no URL is given, the request fails or
    the server does not answer with HTTP status 200.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py URL")

    url = sys.argv[1]

    try:
        resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        sys.exit(f"Could not retrieve URL '{url}': {exc}. Aborting.")

    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)
    links = find_see_also_links(tree)

    for title, href in links or ():
        # urljoin leaves absolute URLs untouched and resolves everything
        # else (site-relative, protocol-relative, page-relative) against
        # the URL of the page
        print(title, urlparse.urljoin(url, href))


if __name__ == '__main__':
    main()