Published in: Python
Expand |
Embed | Plain Text
Copy this code and paste it into your HTML page
import re
import urllib.request


def _collectDomains(source, unique=True, sort=True, restrictToTld=None):
    """Parse *source* HTML and return the bare "domain.tld" strings it links to.

    Parameters:
        source: decoded HTML/text to scan.
        unique: drop duplicate domains when True.
        sort: return the result sorted when True.
        restrictToTld: if set, keep only domains whose TLD equals this string.

    Returns:
        A list of lowercase "domain.tld" strings, or None when no URLs occur.
    """
    # BUG FIX: re.findall returns a list (never None), so the original
    # "matches == None" abort check could never fire; use truthiness instead.
    # Also: the dot in "www." is now escaped (it previously matched any
    # character), and https URLs are accepted alongside http.
    matches = re.findall(
        r"https?://(www\.)?([a-z0-9.-]+\.[a-z]{2,6})\b", source, re.I
    )
    if not matches:
        return None

    collection = []
    for match in matches:
        # Group 2 is the host without any "www." prefix.
        domain = match[1].lower()
        # More than one dot means subdomain(s): keep only the last two
        # labels (NOTE(review): this also truncates "example.co.uk"-style
        # hosts to "co.uk", matching the original behavior).
        if domain.count(".") > 1:
            domain = ".".join(domain.split(".")[-2:])
        # Restrict to a TLD if one was requested.
        if restrictToTld and domain.split(".")[-1] != restrictToTld:
            continue
        # Skip duplicates only when uniqueness was requested.
        if not unique or domain not in collection:
            collection.append(domain)
    return sorted(collection) if sort else collection


def extractUrlsFrom(url, unique=True, sort=True, restrictToTld=None):
    """Fetch *url* over HTTP and extract the domains of all URLs in its source.

    Parameters:
        url: host to fetch; "www." is prepended when missing.
        unique: drop duplicate domains when True.
        sort: return the result sorted when True.
        restrictToTld: if set, keep only domains with this top-level domain.

    Returns:
        A list of "domain.tld" strings, or None when the page has no URLs.
    """
    # Prepend "www." if not present.
    if not url.startswith("www."):
        url = "www." + url

    with urllib.request.urlopen("http://" + url) as h:
        # Prefer the charset declared in the response headers; fall back to
        # the HTTP/1.1 default of ISO-8859-1 (same default as the original).
        charset = h.headers.get_content_charset() or "ISO-8859-1"
        # errors="replace" keeps a mis-declared charset from crashing the scan.
        source = h.read().decode(charset, errors="replace")

    return _collectDomains(
        source, unique=unique, sort=sort, restrictToTld=restrictToTld
    )


if __name__ == "__main__":
    # Quick manual check (requires network access).
    url = "snipplr.com"
    print("From:", url)
    for x in extractUrlsFrom(url) or []:
        print("-", x)