Answers for "web scraper using python"


python web scraping

import requests
from bs4 import BeautifulSoup

# Fetch the job search results page
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

# Parse the HTML with Python's built-in parser
soup = BeautifulSoup(page.content, 'html.parser')
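
The snippet above stops after parsing; a minimal follow-up that pulls something out of the soup could look like the sketch below. It just lists every link on the page, since the exact job-card markup isn't shown in the snippet, and the class name in the trailing comment is only an illustrative assumption.

# Generic follow-up: list every link found on the parsed page
for link in soup.find_all('a', href=True):
    print(link['href'])

# To target specific job cards, inspect the page and pass the relevant
# tag/class to find_all(), e.g. soup.find_all('h2', class_='title')
# (this class name is an assumption, not taken from the page).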
Posted by: Guest on June-16-2020

web scraping python

#pip install beautifulsoup4 lxml

import os
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/"
response = requests.get(url)

if response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    # Grab the <title> tag and strip the surrounding markup
    title = str(soup.find("title"))
    title = title.replace("<title>", "")
    title = title.replace("</title>", "")
    print("The title is : " + title)

os.system("pause")  # Windows-only: keeps the console window open

#python (code name).py
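
As a side note, BeautifulSoup can return a tag's text directly, which avoids the manual string replacements above; an equivalent sketch:

# Equivalent title extraction without manual tag stripping
if response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    print("The title is : " + soup.title.get_text())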
Posted by: Guest on January-09-2021

web scraping python

import scrapy
from ..items import SampletestItem  # item class defined in the project's items.py

class QuoteTestSpider(scrapy.Spider):
    name = 'quote_test'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css("div.quote")
        for quote in quotes:
            items = SampletestItem()  # one item per quote
            items['title'] = quote.css("span.text::text").get()
            items['author'] = quote.css(".author::text").get()
            items['tags'] = quote.css(".tags .tag::text").getall()
            yield items

        # Follow the pagination link once per page (outside the quote loop)
        next_page = response.css(".next a::attr(href)").get()
        if next_page is not None:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(next_url, callback=self.parse)
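
The spider imports SampletestItem from the project's items.py, which the snippet does not show. A minimal definition matching the fields used above would be:

# items.py -- minimal sketch of the item class the spider expects
import scrapy

class SampletestItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()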
Posted by: Guest on December-05-2020

web scraping with python

# example of scraping links concurrently with asyncio and a thread pool
import asyncio, requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup as BS

executor = ThreadPoolExecutor(max_workers=8)
loop = asyncio.get_event_loop()

async def make_requests():
    # lister.php?id=0 ... id=9, then id=a ... id=z
    urls = ['http://www.filedropper.com/lister.php?id=' + c
            for c in '0123456789abcdefghijklmnopqrstuvwxyz']

    # Run the blocking requests.get calls in the thread pool
    futures = [loop.run_in_executor(executor, requests.get, url) for url in urls]
    await asyncio.wait(futures)

    for future in futures:
        soup = BS(future.result().content, 'html.parser')
        for link in soup.find_all('a', href=True):
            print("URL:", link['href'])
            with open('filedropper_com.txt', 'a') as f:
                f.write(link['href'] + '\n')

loop.run_until_complete(make_requests())
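
The original snippet imported aiohttp but never used it. As a rough alternative sketch, the same fetching could be done natively with aiohttp instead of pushing requests.get into a thread pool (the urls list is assumed to be the same as above):

import asyncio, aiohttp
from bs4 import BeautifulSoup as BS

async def fetch(session, url):
    # Fetch one page and return its HTML body as text
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    for page in pages:
        soup = BS(page, 'html.parser')
        for link in soup.find_all('a', href=True):
            print("URL:", link['href'])

# asyncio.run(main(urls))  # urls as defined in the snippet above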
Posted by: Guest on February-24-2020

web scraper python

def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician.
    Relies on simple_get() (defined in the next snippet) and
    BeautifulSoup from bs4.
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select('li'):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))
Posted by: Guest on August-09-2020

web scraper python

from contextlib import closing
from requests import get
from requests.exceptions import RequestException


def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return
    the raw content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
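
Together with the get_names() snippet above (and a BeautifulSoup import from bs4), the helpers can be exercised like this; a minimal usage sketch:

# Usage sketch: combines simple_get() with get_names() from the previous snippet
# (also requires: from bs4 import BeautifulSoup)
if __name__ == '__main__':
    names = get_names()
    print('Found {} mathematicians:'.format(len(names)))
    for name in names:
        print(name)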
Posted by: Guest on August-09-2020
