data scraper python

>>> from bs4 import BeautifulSoup
>>> raw_html = open('contrived.html').read()
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for p in html.select('p'):
...     if p['id'] == 'walrus':
...         print(p.text)
...
'I am the walrus'

>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
...     print(i, li.text)
...
0 Isaac Newton
Archimedes
Carl F. Gauss
Leonhard Euler
Bernhard Riemann

1 Archimedes
Carl F. Gauss
Leonhard Euler
Bernhard Riemann

2 Carl F. Gauss
Leonhard Euler
Bernhard Riemann

3 Leonhard Euler
Bernhard Riemann

4 Bernhard Riemann

# 5 ... and many more...

def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician.

    Returns:
        A list of unique, whitespace-stripped, non-empty names. The names
        are collected in a set, so the order of the list is unspecified.

    Raises:
        Exception: if `simple_get` returned None for the page.
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    names = set()
    for li in html.select('li'):
        # The text of one <li> can carry several names, one per line, so
        # split on the newline character (not on the letter "n").
        for name in li.text.split('\n'):
            # Strip first so whitespace-only fragments are dropped instead
            # of being stored as an empty name.
            name = name.strip()
            if name:
                names.add(name)
    return list(names)

def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the
    last 60 days, as an `int`.

    Returns:
        The hit count as an `int`, or None when the page could not be
        fetched, no link whose href contains 'latest-60' was found, or the
        link text could not be parsed as an integer. Every None path is
        reported through `log_error`.
    """
    # url_root is a template string that is used to build a URL.
    # NOTE(review): the literal below contains no `{}` placeholder, so
    # `.format(name)` returns it unchanged; it looks like a redacted value.
    # Confirm the real template before relying on this function.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        # Use .get() rather than a['href']: anchors without an href
        # attribute would otherwise raise KeyError and abort the lookup.
        hit_link = [a for a in html.select('a')
                    if 'latest-60' in (a.get('href') or '')]

        if hit_link:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # Only a malformed number is expected here; anything else
                # (e.g. KeyboardInterrupt) must not be swallowed.
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None

web scraper python

def simple_get(url, *, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If `is_good_response` accepts the response, return its body
    (`resp.content`), otherwise return None.

    Args:
        url: Address to fetch.
        timeout: Value forwarded to `get` as its `timeout` argument so a
            stalled server cannot block the caller forever. Presumably
            seconds, assuming `get` is `requests.get` -- confirm against
            the file's imports.

    Returns:
        The response body, or None when the response is rejected by
        `is_good_response` or the request raised a `RequestException`
        (which is logged through `log_error`).
    """
    try:
        # closing() guarantees the streamed response is released even on
        # the early-return paths below.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    Args:
        resp: Object exposing `status_code` and a mapping-like `headers`.

    Returns:
        True only when the status code is 200 and the Content-Type header
        is present and contains 'html' (case-insensitively); else False.
    """
    # Use .get() instead of indexing: a response without a Content-Type
    # header must be rejected, not crash with KeyError before the None
    # check is ever reached.
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()


def log_error(e):
    """
    Report an error by printing `e`.

    This is the single hook the other functions call to report problems;
    replace the body to route errors somewhere else.
    """
    message = e
    print(message)

Posted by: Guest on August-09-2020

Source

Code answers related to "data scraper python"

Code answers related to "Python"

Browse Popular Code Answers by Language

Answers for "data scraper python"

Code answers related to "data scraper python"

Code answers related to "Python"

Python Answers by Framework

Browse Popular Code Answers by Language

Popular Programming Languages

Advertisements

Company

Compilers

Help

Connect with us