Add requests and html parsing example
Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
This commit is contained in:
parent
4c18aefce4
commit
7f2c84f53d
|
@ -0,0 +1,52 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import lxml.html
|
||||||
|
|
||||||
|
|
||||||
|
def find_see_also_links(tree):
    """Extract links from the "See also" section of a parsed Wikipedia page.

    Args:
        tree: an ``lxml.html`` document tree of a Wikipedia article page.

    Returns:
        A list of ``(link_text, href)`` tuples for the links in the first
        UL following the "See also" heading, or ``None`` if the page has
        no "See also" section or the section contains no link list.
    """
    # get heading element of "See also" section using XPath
    spans = tree.xpath("//span[@id='See_also']")
    if not spans:
        # page has no "See also" section; return None (falsy) so callers
        # checking `if links:` behave the same as for an empty section
        return None

    see_also_heading = spans[0].getparent()

    # find following UL element, which contains the LI elements with the actual links
    for sibling in see_also_heading.itersiblings():
        hrefs = sibling.xpath("li/a")

        if hrefs:
            return [(href.text, href.get("href")) for href in hrefs]

        # a following heading (h2, h3, ...) marks the start of the next
        # section: stop searching, the "See also" section had no list
        if sibling.tag.startswith("h"):
            break
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fetch the Wikipedia page given on the command line and print the
    title and absolute URL of every link in its "See also" section.

    Exits with a usage message when no URL argument is given, or with an
    error message when the page cannot be retrieved.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]
    url_parts = urlparse.urlsplit(url)
    # keep scheme, netloc and path; drop query string and fragment
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    resp = requests.get(url)

    if resp.status_code != 200:
        # fixed typo: "retrive" -> "retrieve"
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)

    links = find_see_also_links(tree)

    if links:
        # use a distinct loop variable so the outer `url` is not shadowed
        for title, href in links:
            if not href.startswith("https://"):
                # site-relative link: resolve against the page's base URL
                if href.startswith("/"):
                    href = urlparse.urljoin(base_url, href)

            print(title, href)


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue