From 7f2c84f53d9c0ad5e18ae24e9e512e61fdfecc47 Mon Sep 17 00:00:00 2001
From: Christopher Arndt <chris@chrisarndt.de>
Date: Tue, 7 May 2024 08:20:16 +0200
Subject: [PATCH] Add requests and html parsing example

Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
---
 beispiele/wp_crawler1.py | 52 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 beispiele/wp_crawler1.py
diff --git a/beispiele/wp_crawler1.py b/beispiele/wp_crawler1.py
new file mode 100644
index 0000000..3ab01c3
--- /dev/null
+++ b/beispiele/wp_crawler1.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import sys
+import urllib.parse as urlparse
+
+import requests
+import lxml.html
+
+
+def find_see_also_links(tree):
+    """Return (text, href) pairs from the "See also" list, or None if not found."""
+    # locate the anchor span of the "See also" heading using XPath
+    anchors = tree.xpath("//span[@id='See_also']")
+    if not anchors:
+        return None  # the page has no "See also" section
+    # the first following sibling with "li/a" children holds the actual links
+    for sibling in anchors[0].getparent().itersiblings():
+        hrefs = sibling.xpath("li/a")
+        if hrefs:
+            return [(href.text, href.get("href")) for href in hrefs]
+        # siblings include comment nodes, whose tag is not a string; skip those
+        if isinstance(sibling.tag, str) and sibling.tag.startswith("h"):
+            break
+
+
+def main():
+    """Print text and absolute URL of each "See also" link of the page in argv[1]."""
+    if len(sys.argv) < 2:
+        sys.exit("Usage: wp_crawler.py <url>")
+
+    url = sys.argv[1]
+    # keep scheme, host and path only; query and fragment are dropped
+    base_url = urlparse.urlunsplit(list(urlparse.urlsplit(url)[:3]) + ['', ''])
+
+    try:
+        # a timeout keeps the script from hanging on an unresponsive server
+        resp = requests.get(url, timeout=30)
+    except requests.RequestException as exc:
+        sys.exit(f"Could not retrieve URL '{url}' ({exc}). Aborting.")
+    if resp.status_code != 200:
+        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")
+
+    links = find_see_also_links(lxml.html.fromstring(resp.text))
+    for title, href in links or ():
+        # anchors without an href attribute yield None; skip them
+        if href is not None:
+            # urljoin resolves relative links and leaves absolute ones as-is
+            print(title, urlparse.urljoin(base_url, href))
+
+
+if __name__ == "__main__":  # script entry point; skipped when imported
+    main()