python-kurs-softed/beispiele/wp_crawler1.py

#!/usr/bin/env python

import sys
import urllib.parse as urlparse

import requests
import lxml.html


def find_see_also_links(tree):
    """Return the links listed in the "See also" section of a parsed page.

    The section is located through the ``span`` element with
    ``id="See_also"``; the parent of that span is treated as the section
    heading. The heading's following siblings are scanned in document order
    and the first one that has ``li/a`` children (the list holding the
    actual links) supplies the result. Scanning stops at the next heading
    element (``h1`` .. ``h6``).

    Args:
        tree: Parsed HTML document supporting ``xpath()``, e.g. the result
            of ``lxml.html.fromstring``.

    Returns:
        A list of ``(text, href)`` tuples, one per link, or ``None`` when the
        page has no "See also" section or no link list follows its heading
        before the next heading.
    """
    # get anchor element of "See also" section using XPath
    anchors = tree.xpath("//span[@id='See_also']")

    if not anchors:
        # No such section: report "nothing found" instead of raising
        # IndexError, so callers can treat both miss cases the same way.
        return None

    see_also_heading = anchors[0].getparent()
    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")

    # find following UL element, which contains the LI elements with the actual links
    for sibling in see_also_heading.itersiblings():
        hrefs = sibling.xpath("li/a")

        if hrefs:
            return [(href.text, href.get("href")) for href in hrefs]

        # Only real heading elements end the section. Siblings such as
        # comments do not carry a string tag in lxml, and tags like "hr"
        # merely start with "h", so compare against the exact heading names.
        if isinstance(sibling.tag, str) and sibling.tag in heading_tags:
            break

    return None


def main():
    """Print the "See also" links of the page given on the command line.

    Reads the page URL from ``sys.argv[1]``, downloads the page, extracts
    the links with ``find_see_also_links`` and prints one ``title url`` pair
    per line. Hrefs are resolved against the final URL of the response, so
    relative links are printed as absolute URLs while absolute links are
    printed unchanged.

    Exits via ``sys.exit`` with a message when no URL argument is given,
    when the request fails, or when the response status is not 200. Prints
    nothing when no links are found.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]

    try:
        # Always pass a timeout: without one the request can block forever
        # on an unresponsive server.
        resp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        sys.exit(f"Could not retrieve URL '{url}': {exc}. Aborting.")

    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)
    links = find_see_also_links(tree)

    if not links:
        return

    for title, href in links:
        if href is None:
            # Anchor without an href attribute: there is no target to print.
            continue

        # urljoin leaves absolute URLs untouched and resolves every kind of
        # relative reference; resp.url is the URL after any redirects.
        print(title, urlparse.urljoin(resp.url, href))


# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Add requests and html parsing example Signed-off-by: Christopher Arndt <chris@chrisarndt.de> 2024-05-07 08:20:16 +02:00			`#!/usr/bin/env python`

			`import sys`
			`import urllib.parse as urlparse`

			`import requests`
			`import lxml.html`


			`def find_see_also_links(tree):`

			`# get heading element of "See also" section using XPath`
			`see_also_heading = tree.xpath("//span[@id='See_also']")[0].getparent()`

			`# find following UL element, which contains the LI elements with the actual links`
			`for sibling in see_also_heading.itersiblings():`
			`hrefs = sibling.xpath("li/a")`

			`if hrefs:`
			`return [(href.text, href.get("href")) for href in hrefs]`

			`if sibling.tag.startswith("h"):`
			`break`


			`def main():`
			`if len(sys.argv) < 2:`
			`sys.exit("Usage: wp_crawler.py <url>")`

			`url = sys.argv[1]`
			`url_parts = urlparse.urlsplit(url)`
			`base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])`

			`resp = requests.get(url)`

			`if not resp.status_code == 200:`
			`sys.exit(f"Could not retrive URL '{url}'. Aborting.")`
			`tree = lxml.html.fromstring(resp.text)`

			`links = find_see_also_links(tree)`

			`if links:`
			`for title, url in links:`
			`if not url.startswith("https://"):`
			`if url.startswith("/"):`
			`url = urlparse.urljoin(base_url, url)`

			`print(title, url)`


			`if __name__ == '__main__':`
			`main()`