Scraping Seattle Restaurant Data from the Yelp Website

The goal of this project is to scrape the Yelp website for information on all restaurants listed in the Seattle area. At the end of this project, we will have the following data about each restaurant:

  1. Name
  2. Address
  3. District
  4. No. of Reviews
  5. Rating
  6. Price Range
  7. Cuisine

I hope you enjoy this.

In [1]:
# importing packages
# standard library
import re
from random import randint
from time import sleep
from time import time
from warnings import warn

# third-party
import pandas as pd
from bs4 import BeautifulSoup
# NOTE(review): IPython's public location for this is `IPython.display` — consider switching
from IPython.core.display import clear_output
from requests import get

# reference point for the requests/s figure printed while scraping
start_time = time()
In [2]:
# Initialise the scraping state: the result offset that is appended to the search
# URL, the counter of requests sent so far, and one empty list per field that is
# collected for every restaurant.

page = requests = 0

# six independent lists, one per column of the final table
name, address, district, review, rating, price = ([] for _ in range(6))
In [3]:
# Scrape every results page of the Yelp restaurant search and collect one entry per
# restaurant container into the lists name, address, district, review, rating, price.
#
# Relies on names created in earlier cells: the six lists, the `requests` counter
# and `start_time`.

RESULTS_PER_PAGE = 30    # step of the `start` offset appended to the url
TOTAL_RESULTS = 361      # number of entries; will vary if searched for some other place
REQUEST_TIMEOUT = 30     # seconds; without a timeout a stalled connection blocks forever
MISSING = '-'            # placeholder stored when a field is absent from a container

# edit the url as per your need
base_url = 'https://www.yelp.ca/search?cflt=restaurants&find_loc=Seattle%2C%20WA%2C%20United%20States&start='

# class id for containers on yelp website
container_class = 'lemon--div__373c0__1mboc searchResult__373c0__1yggB border-color--default__373c0__2oFDT'


def _extract_fields(container):
    """Return (name, address, district, reviews, rating, price) for one container.

    Every value is a string. A field whose element is not present in the
    container is returned as MISSING instead of raising, so a single unusual
    container can neither abort the scrape nor leave the result lists with
    different lengths.
    """
    # Name: drop the leading digits/dots/whitespace in front of the heading text
    heading = container.h3
    if heading is not None:
        restaurant_name = re.sub(r'^[\d.\s]+', '', heading.get_text(strip=True))
    else:
        restaurant_name = MISSING

    # Address and District come from the same element: look it up only once
    secondary = container.select_one('[class*="secondaryAttributes"]')
    if secondary is not None:
        parts = secondary.get_text(separator='|', strip=True).split('|')
    else:
        parts = []
    restaurant_address = parts[1] if len(parts) > 1 else MISSING
    restaurant_district = parts[-1] if parts else MISSING

    # No. of Reviews: keep digits and dots only
    count = container.select_one('[class*="reviewCount"]')
    if count is not None:
        restaurant_numReview = re.sub(r'[^\d.]', '', count.get_text(strip=True))
    else:
        restaurant_numReview = MISSING

    # Average Review: the value is read from the aria-label of the stars element
    stars = container.select_one('[class*="stars"][aria-label]')
    if stars is not None:
        restaurant_starCount = re.sub(r'[^\d.]', '', stars['aria-label'])
    else:
        restaurant_starCount = MISSING

    # Price and Cuisine (kept together here; split into two columns later)
    pr = container.select_one('[class*="priceCategory"]')
    restaurant_price = pr.get_text(strip=True) if pr is not None else MISSING

    return (restaurant_name, restaurant_address, restaurant_district,
            restaurant_numReview, restaurant_starCount, restaurant_price)


for page in range(0, TOTAL_RESULTS, RESULTS_PER_PAGE):

    url = base_url + str(page)
    # raises a requests exception if the server does not answer within the timeout
    response = get(url, timeout=REQUEST_TIMEOUT)

    # random pause between requests
    sleep(randint(8,15))

    #to get the progress status
    requests += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # an error page holds no restaurant data, so do not try to parse it
        continue

    html_soup = BeautifulSoup(response.text, 'html.parser')

    # the first matching container is skipped
    # NOTE(review): presumably it is not a regular restaurant result — confirm
    rest_containers = html_soup.find_all('div', class_ = container_class)[1:]

    #for loop to iterate through the restaurant containers
    for container in rest_containers:
        # extract everything first, then append, so all six lists grow together
        fields = _extract_fields(container)
        for column, value in zip((name, address, district, review, rating, price), fields):
            column.append(value)
Request:13; Frequency: 0.06899560900756525 requests/s
In [4]:
# Sanity check: print how many entries were collected for each field, one count per
# line, in the order name, address, district, review, rating, price.
for collected in (name, address, district, review, rating, price):
    print(len(collected))
390
390
390
390
390
390
In [5]:
# creating a dataframe of all the restaurant data; each collected list becomes one column

rest_data = pd.DataFrame({
    'Restaurant Name': name,
    'Address': address,
    'District': district,
    'No. of Reviews': review,
    'Rating': rating,
    'Price': price,
})

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only added a stray "None" line to the cell output.
rest_data.info()
rest_data.head(3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 6 columns):
Restaurant Name    390 non-null object
Address            390 non-null object
District           390 non-null object
No. of Reviews     390 non-null object
Rating             390 non-null object
Price              390 non-null object
dtypes: object(6)
memory usage: 18.4+ KB
None
Out[5]:
Restaurant Name Address District No. of Reviews Rating Price
0 The Pink Door Located in Downtown 4796 4.5 $$Italian,Wine Bars,Seafood
1 Paju Lower Queen Anne Lower Queen Anne 35 5 $$Korean
2 Fogón Cocina Mexicana 600 E Pine St Capitol Hill 1338 4.5 $$Mexican
In [6]:
# splitting up Cuisine and Price data into different columns
#
# 'Price' initially holds the price symbols immediately followed by the cuisine
# list (e.g. '$$Korean'). rpartition('$') splits at the LAST '$' into
# (text before, '$', text after): the text after is the cuisine, and the first two
# pieces joined back together are the price symbols.
#
# The guard keeps the cell safe to re-run: once 'Price' has been reduced to
# '$$', splitting it again would overwrite 'Cuisine' with empty strings.
if 'Cuisine' not in rest_data.columns:
    price_parts = rest_data['Price'].str.rpartition('$')   # computed once, reused below
    rest_data['Cuisine'] = price_parts[2]
    rest_data['Price'] = price_parts[0] + price_parts[1]
rest_data.head()
Out[6]:
Restaurant Name Address District No. of Reviews Rating Price Cuisine
0 The Pink Door Located in Downtown 4796 4.5 $$ Italian,Wine Bars,Seafood
1 Paju Lower Queen Anne Lower Queen Anne 35 5 $$ Korean
2 Fogón Cocina Mexicana 600 E Pine St Capitol Hill 1338 4.5 $$ Mexican
3 Din Tai Fung Located in Downtown 1255 4 $$ Taiwanese,Shanghainese,Dim Sum
4 Stateside 300 E Pike St Capitol Hill 875 4 $$ Vietnamese,Asian Fusion,Cocktail Bars
In [8]:
# Write the final table to a CSV file in the working directory; the row index is
# left out of the file.
output_file = 'Seattle Restaurants Data.csv'
rest_data.to_csv(output_file, index=False)

Author: Amandeep Saluja