python-kurs-softed/beispiele/wp_crawler1.py

53 lines
1.3 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python
import sys
import urllib.parse as urlparse
import requests
import lxml.html
def find_see_also_links(tree):
    """Return the links listed in the "See also" section of a parsed page.

    The section is located through the ``span`` element carrying
    ``id="See_also"``; its parent is treated as the section heading. The
    heading's following siblings are then scanned in document order for the
    first one that yields ``li/a`` matches.

    Args:
        tree: Root of the parsed document. It must offer the lxml element
            API used here (``xpath``, ``getparent``, ``itersiblings``).

    Returns:
        A list of ``(title, href)`` tuples for the first following sibling
        that contains ``li/a`` links. ``title`` is the complete text of the
        anchor, including text inside nested markup; ``href`` is the raw
        attribute value (``None`` when the anchor has no ``href``).
        ``None`` is returned when the page has no "See also" heading, or
        when a sibling whose tag starts with "h" is reached before any
        links are found.
    """
    see_also_spans = tree.xpath("//span[@id='See_also']")
    if not see_also_spans:
        # Page has no "See also" section at all.
        return None

    heading = see_also_spans[0].getparent()
    if heading is None:
        return None

    for sibling in heading.itersiblings():
        anchors = sibling.xpath("li/a")
        if anchors:
            # string() concatenates all descendant text, so titles wrapped
            # in markup such as <i>...</i> are not lost.
            return [
                (str(anchor.xpath("string()")), anchor.get("href"))
                for anchor in anchors
            ]
        # Comments and processing instructions have a non-string tag;
        # only real elements can terminate the search.
        if isinstance(sibling.tag, str) and sibling.tag.startswith("h"):
            break
    return None
def main():
    """Fetch a page and print the links from its "See also" section.

    The page URL is read from the first command-line argument. Each link is
    printed as ``title url`` on its own line, with the link target resolved
    against the page URL so that relative targets become absolute.

    Exits via ``sys.exit`` with a message when no URL argument is given,
    when the request fails, or when the response status is not 200.
    Nothing is printed if no "See also" links are found.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]
    url_parts = urlparse.urlsplit(url)
    # Page URL with query and fragment removed; used as the base for
    # resolving relative link targets.
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    try:
        # Without a timeout a stalled server would block the script forever.
        resp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        sys.exit(f"Could not retrieve URL '{url}': {exc}. Aborting.")

    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)
    links = find_see_also_links(tree)

    for title, href in links or []:
        if href is None:
            # Anchor without an href attribute: there is no target to print.
            continue
        # urljoin leaves absolute URLs untouched and resolves every kind of
        # relative reference (path-absolute, path-relative, fragment-only).
        print(title, urlparse.urljoin(base_url, href))
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()