python-kurs-softed/beispiele/http1.py

34 lines
958 B
Python

import sys
from urllib.parse import urljoin

import lxml.html
import requests
# Fetch a Wikipedia article and print a list of (title, absolute URL) tuples
# for the links found in list items following its "See also" heading.
#
# Usage: http1.py [PAGE]
#   PAGE is resolved against base_url, so "wiki/Foo", "/wiki/Foo" and a
#   complete URL are all accepted.  Without PAGE the Python article is used.
base_url = "https://en.wikipedia.org/"

if len(sys.argv) >= 2:
    # urljoin() instead of string concatenation: base_url already ends in "/",
    # so "base_url + '/wiki/Foo'" would produce a double slash.
    url = urljoin(base_url, sys.argv[1])
else:
    url = "https://en.wikipedia.org/wiki/Python_(programming_language)"

# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, timeout=10)

see_also = []
if resp.status_code == 200:
    tree = lxml.html.fromstring(resp.text)
    # Match on the id alone rather than on <span>, so the lookup does not
    # depend on which element carries the "See_also" anchor.
    anchors = tree.xpath("//*[@id='See_also']")
    # A page without a "See also" section simply yields an empty result
    # instead of raising IndexError.
    if anchors:
        see_also_heading = anchors[0].getparent()
        # itersiblings() yields every following sibling of the heading
        # element; only those with direct <li>/<a> children contribute links.
        for sibling in see_also_heading.itersiblings():
            for link in sibling.xpath("li/a"):
                href = link.get("href")
                if href is None:
                    # <a> without a target (e.g. a pure anchor): nothing to
                    # resolve.
                    continue
                # Resolve relative to the URL that was actually served
                # (after redirects); absolute hrefs pass through unchanged.
                see_also.append((link.get("title"), urljoin(resp.url, href)))

print(see_also)