Add requests and html parsing example
Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
This commit is contained in:
parent
4c18aefce4
commit
7f2c84f53d
|
@ -0,0 +1,52 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import lxml.html
|
||||||
|
|
||||||
|
|
||||||
|
def find_see_also_links(tree):
    """Extract links from the "See also" section of a parsed Wikipedia page.

    Args:
        tree: an ``lxml.html`` document tree of a Wikipedia article page.

    Returns:
        A list of ``(link_text, href)`` tuples for the links in the first
        UL following the "See also" heading, or ``None`` if the page has
        no "See also" section or the section contains no link list.
    """
    # get heading element of "See also" section using XPath
    spans = tree.xpath("//span[@id='See_also']")
    if not spans:
        # page has no "See also" section; return None (falsy) so callers
        # checking `if links:` behave the same as for an empty section
        return None

    see_also_heading = spans[0].getparent()

    # find following UL element, which contains the LI elements with the actual links
    for sibling in see_also_heading.itersiblings():
        hrefs = sibling.xpath("li/a")

        if hrefs:
            return [(href.text, href.get("href")) for href in hrefs]

        # a following heading (h2, h3, ...) marks the start of the next
        # section: stop searching, the "See also" section had no list
        if sibling.tag.startswith("h"):
            break
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fetch the Wikipedia page given on the command line and print the
    title and absolute URL of every link in its "See also" section.

    Exits with a usage message when no URL argument is given, or with an
    error message when the page cannot be retrieved.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]
    url_parts = urlparse.urlsplit(url)
    # keep scheme, netloc and path; drop query string and fragment
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    resp = requests.get(url)

    if resp.status_code != 200:
        # fixed typo: "retrive" -> "retrieve"
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)

    links = find_see_also_links(tree)

    if links:
        # use a distinct loop variable so the outer `url` is not shadowed
        for title, href in links:
            if not href.startswith("https://"):
                # site-relative link: resolve against the page's base URL
                if href.startswith("/"):
                    href = urlparse.urljoin(base_url, href)

            print(title, href)


if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue