Compare commits

...

2 Commits

Author SHA1 Message Date
Christopher Arndt 7f2c84f53d Add requests and html parsing example
Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
2024-05-07 08:20:16 +02:00
Christopher Arndt 4c18aefce4 File reading examples improvements
Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
2024-05-07 08:17:09 +02:00
7 changed files with 133 additions and 3 deletions

View File

@ -0,0 +1,5 @@
record 1: 100, 300, 50, 200
record 2 300, 450, 50, 100
record 3: 150, 200, 0, 150
record 4: 350, 200, 5O, 250
record 5: 50, 100, 200, 300

5
beispiele/data.txt Normal file
View File

@ -0,0 +1,5 @@
record 1: 100, 300, 50, 200
record 2: 300, 450, 50, 100
record 3: 150, 200, 0, 150
record 4: 350, 200, 50, 250
record 5: 50, 100, 200, 300

View File

@ -8,7 +8,7 @@ with open(filename, "r") as fileobj:
max_linelength = 0 max_linelength = 0
longest_line = -1 longest_line = -1
for i, line in enumerate(data): for i, line in enumerate(fileobj):
if len(line) > max_linelength: if len(line) > max_linelength:
max_linelength = len(line) max_linelength = len(line)
longest_line = i + 1 longest_line = i + 1

View File

@ -2,12 +2,15 @@
import sys import sys
if len(sys.argv) >= 2:
filename = sys.argv[1] filename = sys.argv[1]
else:
filename = "data.txt"
data = {} data = {}
with open(filename, "r") as fileobj: with open(filename, "r") as fileobj:
for i, line in enumerate(data): for i, line in enumerate(fileobj):
line = line.strip() # remove whitespace from start/end of line line = line.strip() # remove whitespace from start/end of line
if line.startswith('#'): if line.startswith('#'):

29
beispiele/readfile5.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
"""Read "name: item, item, ..." records from a text file and print them.

The file name is taken from the first command line argument and defaults
to "data.txt". Lines starting with "#" are treated as comments. Items are
kept as the raw strings found between the commas.

Note: a line without a colon is not handled here; unpacking the result of
the split raises ValueError and aborts the script.
"""

import sys


# use the file named on the command line, otherwise fall back to the default
filename = sys.argv[1] if len(sys.argv) >= 2 else "data.txt"

data = {}

try:
    with open(filename, "r") as fileobj:
        for raw_line in fileobj:
            text = raw_line.strip()  # drop surrounding whitespace incl. newline

            if not text.startswith('#'):  # comment lines are skipped
                # everything before the first colon is the record name,
                # the remainder is a comma-separated list of items
                name, raw_data = text.split(":", 1)
                data[name.strip()] = raw_data.split(",")
except OSError as exc:
    # report the problem and carry on with whatever was read so far
    print(f"Could not open file {filename}: {exc}")

for key, items in data.items():
    print(key, items)

36
beispiele/readfile6.py Normal file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env python
"""Read "name: int, int, ..." records from a text file and print them.

The file name is taken from the first command line argument and defaults
to "data.txt". Blank lines and lines starting with "#" are ignored. Lines
that cannot be parsed produce a warning and are skipped.
"""

import sys


def parse_records(lines, records):
    """Parse record lines and store them in the ``records`` dictionary.

    Each record line has the form ``name: value, value, ...`` where every
    value must be convertible with ``int()``.

    Args:
        lines: iterable of text lines, e.g. an open file object.
        records: dictionary that is updated in place, mapping the stripped
            record name to its list of integers. Updating in place means
            records parsed before a read error are not lost.

    Returns:
        The same ``records`` dictionary, for convenience.
    """
    for lineno, line in enumerate(lines, start=1):
        line = line.strip()  # remove whitespace from start/end of line

        if not line or line.startswith('#'):
            # ignore blank lines and comment lines
            continue

        try:
            name, raw_data = line.split(":", 1)  # split line at first colon
        except ValueError as exc:
            print(f"Warning: could not parse line {lineno}: {exc}")
            continue

        try:
            # split raw data at commas and convert each item to an integer
            items = [int(item) for item in raw_data.split(",")]
        except (ValueError, TypeError) as exc:
            print(f"Warning: could not parse data on line {lineno}: {exc}")
            # Skip the record. Without this, a stale ``items`` from the
            # previous line (or an unbound name) would be stored below.
            continue

        records[name.strip()] = items

    return records


if len(sys.argv) >= 2:
    filename = sys.argv[1]
else:
    filename = "data.txt"

data = {}

try:
    with open(filename, "r") as fileobj:
        parse_records(fileobj, data)
except OSError as exc:
    print(f"Could not open file {filename}: {exc}")

for key in data:
    print(key, data[key])

52
beispiele/wp_crawler1.py Normal file
View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
import sys
import urllib.parse as urlparse
import requests
import lxml.html
def find_see_also_links(tree):
    """Return the links listed in the "See also" section of a parsed page.

    Args:
        tree: parsed HTML document providing the lxml element API
            (``xpath()``, ``getparent()``, ``itersiblings()``).

    Returns:
        A list of ``(link text, href)`` tuples taken from the first sibling
        after the "See also" heading that contains ``li/a`` elements, or
        ``None`` if the page has no such section or no link list is found
        before the next heading.
    """
    heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}

    # get heading element of "See also" section using XPath
    anchors = tree.xpath("//span[@id='See_also']")
    if not anchors:
        # page has no "See also" section
        return None

    see_also_heading = anchors[0].getparent()
    if see_also_heading is None:
        return None

    # find following UL element, which contains the LI elements with the
    # actual links
    for sibling in see_also_heading.itersiblings():
        hrefs = sibling.xpath("li/a")
        if hrefs:
            return [(href.text, href.get("href")) for href in hrefs]

        # Stop at the next section heading. An exact membership test is used
        # instead of a "starts with h" check, so that e.g. <hr> does not end
        # the search and nodes whose tag is not a string (comments) do not
        # raise an error.
        if sibling.tag in heading_tags:
            break

    return None
def main():
    """Fetch the page given on the command line and print its "See also" links.

    The URL is read from the first command line argument. Each link found
    by ``find_see_also_links()`` is printed as "<title> <absolute URL>".

    Exits via ``sys.exit()`` with a message if no URL is given, if the
    request fails, or if the server does not answer with status 200.
    """
    request_timeout = 10  # seconds; without a timeout the request may hang

    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]
    url_parts = urlparse.urlsplit(url)
    # keep scheme, host and path only; query string and fragment are dropped
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    try:
        resp = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        # connection problems, timeouts, invalid URLs, ...
        sys.exit(f"Could not retrieve URL '{url}': {exc}. Aborting.")

    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)
    links = find_see_also_links(tree)

    if links:
        for title, href in links:
            if not href:
                # anchor without a target, nothing to print
                continue

            # urljoin() resolves relative references against the page URL
            # and leaves absolute URLs as they are
            print(title, urlparse.urljoin(base_url, href))


if __name__ == '__main__':
    main()