Add requests and html parsing example
Signed-off-by: Christopher Arndt <chris@chrisarndt.de>
This commit is contained in:
		
							parent
							
								
									4c18aefce4
								
							
						
					
					
						commit
						7f2c84f53d
					
				
							
								
								
									
										52
									
								
								beispiele/wp_crawler1.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								beispiele/wp_crawler1.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,52 @@
 | 
			
		||||
#!/usr/bin/env python
 | 
			
		||||
 | 
			
		||||
import sys
 | 
			
		||||
import urllib.parse as urlparse
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
import lxml.html
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def find_see_also_links(tree):
    """Return (text, href) pairs for the links in the page's "See also" section.

    tree is a parsed lxml.html document. Returns None when the page has no
    "See also" section or the section contains no links.
    """
    # Locate the heading via the span anchor with id "See_also"; the heading
    # element itself is the span's parent. Guard against pages that lack the
    # section entirely (the original indexed [0] and raised IndexError).
    anchors = tree.xpath("//span[@id='See_also']")
    if not anchors:
        return None
    see_also_heading = anchors[0].getparent()

    # Walk the following siblings looking for the UL whose LI elements hold
    # the actual links; stop at the next heading so we never read links from
    # an unrelated later section.
    for sibling in see_also_heading.itersiblings():
        hrefs = sibling.xpath("li/a")

        if hrefs:
            return [(href.text, href.get("href")) for href in hrefs]

        if sibling.tag.startswith("h"):
            break
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main():
    """Fetch the Wikipedia page given on the command line and print the
    title and URL of each link found in its "See also" section.

    Exits with a usage message when no URL argument is given, and with an
    error message when the page cannot be retrieved.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: wp_crawler.py <url>")

    url = sys.argv[1]
    # Keep only scheme, netloc and path; drop query and fragment to build
    # the base URL that relative links are resolved against.
    url_parts = urlparse.urlsplit(url)
    base_url = urlparse.urlunsplit(list(url_parts[:3]) + ['', ''])

    resp = requests.get(url)

    if resp.status_code != 200:
        sys.exit(f"Could not retrieve URL '{url}'. Aborting.")

    tree = lxml.html.fromstring(resp.text)

    links = find_see_also_links(tree)

    if links:
        # Use a distinct name for the link target so it does not shadow the
        # page URL fetched above.
        for title, href in links:
            if not href.startswith("https://"):
                # Site-relative links (e.g. "/wiki/Foo") are made absolute.
                if href.startswith("/"):
                    href = urlparse.urljoin(base_url, href)

            print(title, href)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    main()
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user