#!/usr/bin/env python
"""Print the title and URL of each link in a Wikipedia page's "See also" section.

Usage: wp_crawler.py <url>
"""
import sys
import urllib.parse as urlparse

import requests
import lxml.html


def find_see_also_links(tree):
    """Return ``(title, href)`` pairs from the page's "See also" section.

    tree: an ``lxml.html`` document tree of a Wikipedia article page.
    Returns a list of ``(link_text, href)`` tuples, or ``None`` when the
    page has no "See also" section or the section contains no links.
    """
    # MediaWiki places a <span id="See_also"> anchor inside the section
    # heading; locate it via XPath. Guard against pages that have no
    # such section (the original code raised IndexError here).
    anchors = tree.xpath("//span[@id='See_also']")
    if not anchors:
        return None
    see_also_heading = anchors[0].getparent()

    # Scan the heading's following siblings for the UL element whose LI
    # children hold the actual links; stop when the next section heading
    # (any h* tag) is reached without finding one.
    for sibling in see_also_heading.itersiblings():
        links = sibling.xpath("li/a")
        if links:
            return [(a.text, a.get("href")) for a in links]
        if sibling.tag.startswith("h"):
            break
    return None


def main():
    """Fetch the URL given on the command line and print its "See also" links."""
    if len(sys.argv) < 2:
        # Original usage string was missing the argument placeholder.
        sys.exit("Usage: wp_crawler.py <url>")
    url = sys.argv[1]

    # Keep scheme, netloc and path (drop query/fragment) so relative
    # hrefs can be resolved against the page's own location.
    url_parts = urlparse.urlsplit(url)
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    resp = requests.get(url)
    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)
    links = find_see_also_links(tree)
    if links:
        for title, href in links:  # renamed from `url`, which shadowed the outer variable
            # Wikipedia hrefs are typically server-relative ("/wiki/...");
            # resolve those against the page's base URL. Absolute https://
            # links are printed as-is.
            if not href.startswith("https://") and href.startswith("/"):
                href = urlparse.urljoin(base_url, href)
            print(title, href)


if __name__ == '__main__':
    main()