4 분 소요

시카고 맛집 데이터 분석

1. 개요

  • https://www.chicagomag.com/chicago-magazine/november-2012/best-sandwiches-chicago/
  • Chicago Magazine: The 50 Best Sandwiches in Chicago

목표

50개 페이지에서 각 가게의 정보 가져오기

  • 가게이름
  • 대표메뉴
  • 대표메뉴의 가격
  • 가게주소

2. 메인페이지

from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import ssl

# NOTE(review): this disables TLS certificate verification and relies on a
# private ssl helper. Tolerable for a one-off scrape, but do not reuse this
# context for anything sensitive.
context = ssl._create_unverified_context()

url_base = "https://www.chicagomag.com/"
url_sub = "chicago-magazine/november-2012/best-sandwiches-chicago/"
url = url_base + url_sub

# Send the request with a User-Agent string supplied by fake_useragent.
ua = UserAgent()
req = Request(url, headers={"User-Agent": ua.ie})

# Read the body inside a `with` block so the HTTP response is always closed.
with urlopen(req, context=context) as response:
    html = response.read()

soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

=>

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <script src="https://cmp.osano.com/16A1AnRt2Fn8i1unj/f15ebf08-7008-40fe-
...
  </script>
 </body>
</html>
# Collect every <div class="sammy"> once, then show the matches and their count.
sammy_divs = soup.find_all("div", class_="sammy")
sammy_divs, len(sammy_divs)
# CSS-selector equivalent: soup.select(".sammy"), len(soup.select(".sammy"))

=>

([<div class="sammy" style="position: relative;">
  <div class="sammyRank">1</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
  Old Oak Tap<br/>
  <em>Read more</em> </a></div>
  </div>,
  <div class="sammy" style="position: relative;">
  <div class="sammyRank">2</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-
...
  <div class="sammyListing"><a href="https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Phoebes-Bakery-The-Gatsby/"><b>The Gatsby</b><br/>
  Phoebe’s Bakery<br/>
  <em>Read more</em> </a></div>
  </div>],
 50)
# Take the first "sammy" listing as a sample to explore its structure.
tmp_one = soup.find_all("div", "sammy")[0]
type(tmp_one)

=>

bs4.element.Tag
# Rank text of the sample listing, taken from its nested "sammyRank" element.
tmp_one.find(class_="sammyRank").get_text()
# CSS-selector equivalent: tmp_one.select_one(".sammyRank").text

=>

'1'
# Full text of the "sammyListing" element: menu, cafe and the "Read more" label.
tmp_one.find("div", {"class":"sammyListing"}).text
# CSS-selector equivalent: tmp_one.select_one(".sammyListing").get_text()

=>

'BLT\nOld Oak Tap\nRead more '
# Link to the sandwich's detail page (the first <a> inside the listing).
tmp_one.find("a")["href"]
# CSS-selector equivalent: tmp_one.select_one("a").get("href")

=>

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'
import re

# The listing text is newline-separated (menu, cafe, "Read more"), so splitting
# on either line-ending style yields one list entry per field.
tmp_string = tmp_one.find(class_="sammyListing").get_text()
re.split(("\n|\r\n"), tmp_string)

=>

['BLT', 'Old Oak Tap', 'Read more ']
# Split the listing text once and pick the two leading fields by position.
parts = re.split(("\n|\r\n"), tmp_string)
print(parts[0])  # menu
print(parts[1])  # cafe

=>

BLT
Old Oak Tap

DataFrame

from urllib.parse import urljoin

url_base = "http://www.chicagomag.com"

# One list per column; they are combined into a DataFrame afterwards.
rank, main_menu, cafe_name, url_add = [], [], [], []

list_soup = soup.find_all("div", "sammy")  # same as soup.select(".sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())

    # Listing text is "<menu>\n<cafe>\nRead more "; split it once and reuse.
    tmp_string = item.find(class_="sammyListing").get_text()
    menu_and_cafe = re.split(("\n|\r\n"), tmp_string)
    main_menu.append(menu_and_cafe[0])
    cafe_name.append(menu_and_cafe[1])

    # urljoin prefixes relative hrefs with url_base and leaves absolute ones as-is.
    url_add.append(urljoin(url_base, item.find("a")["href"]))
    
import pandas as pd

# Column name -> list of values collected from the listing loop.
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)

df = pd.DataFrame(data)
df.tail(2)
Rank Menu Cafe URL
48 49 Le Végétarien Toni Patisserie https://www.chicagomag.com/Chicago-Magazine/No...
49 50 The Gatsby Phoebe’s Bakery https://www.chicagomag.com/Chicago-Magazine/No...

컬럼 순서 변경

# Rebuild the frame from the same dict, passing `columns` so Cafe comes before Menu.
df = pd.DataFrame(data, columns=["Rank", "Cafe", "Menu", "URL"])
df.tail()
Rank Cafe Menu URL
45 46 Chickpea Kufta https://www.chicagomag.com/Chicago-Magazine/No...
46 47 The Goddess and Grocer Debbie’s Egg Salad https://www.chicagomag.com/Chicago-Magazine/No...
47 48 Zenwich Beef Curry https://www.chicagomag.com/Chicago-Magazine/No...
48 49 Toni Patisserie Le Végétarien https://www.chicagomag.com/Chicago-Magazine/No...
49 50 Phoebe’s Bakery The Gatsby https://www.chicagomag.com/Chicago-Magazine/No...

데이터 저장

# Save the ranking table as UTF-8 CSV; the index is written as the first column
# (it is read back with index_col=0 in the next section).
df.to_csv(
    "../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8"
)

3. 하위페이지

requirements

import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Reload the saved list, using the first CSV column as the index.
# NOTE: the cells below also use `ua` and `context` created in the main-page section.
df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", index_col=0)
df.tail()
Rank Cafe Menu URL
45 46 Chickpea Kufta https://www.chicagomag.com/Chicago-Magazine/No...
46 47 The Goddess and Grocer Debbie’s Egg Salad https://www.chicagomag.com/Chicago-Magazine/No...
47 48 Zenwich Beef Curry https://www.chicagomag.com/Chicago-Magazine/No...
48 49 Toni Patisserie Le Végétarien https://www.chicagomag.com/Chicago-Magazine/No...
49 50 Phoebe’s Bakery The Gatsby https://www.chicagomag.com/Chicago-Magazine/No...

# Detail-page URL of the first row (index label 0).
df["URL"][0]

=>

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'


# Fetch the first detail page, reusing `ua` and `context` from the main-page section.
req = Request(df["URL"][0], headers={"user-agent": ua.ie})

# Read the body inside a `with` block so the HTTP response is always closed.
with urlopen(req, context=context) as response:
    html = response.read()

soup_tmp = BeautifulSoup(html, "html.parser")
# <p class="addy"> holds the price, address, phone and website.
soup_tmp.find("p", "addy")  # same as soup_tmp.select_one(".addy")

=>

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

regular expression

# Plain text of the address paragraph: price, street address, phone and website.
price_tmp = soup_tmp.find("p", "addy").text
price_tmp

=>

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

import re
# NOTE: the "." in ".," is an unescaped regex wildcard, so every split also
# consumes the character just before the comma (see "773-772-040" in the output).
re.split(".,", price_tmp)

=>

['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']

# Keep only the text before the first split point: the price and street address.
price_tmp = re.split(".,", price_tmp)[0]
price_tmp

=>

'\n$10. 2109 W. Chicago Ave'

# Raw string: "\$" and "\d" are not valid escapes in a normal string literal.
# The pattern matches "$", digits, a period and any digits after it, so a price
# such as "$10." keeps its trailing period.
tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()
# Skip the leading "\n", the price and the following space; the rest is the address.
price_tmp[len(tmp) + 2:]

=>

'2109 W. Chicago Ave'

가격, 주소 추가

from tqdm import tqdm

price = []
address = []

# Patterns are compiled once, as raw strings (the original "\$" / "\d" escapes
# are invalid in a normal string literal).
# NOTE(review): the "." in addy_split is an unescaped wildcard, so the character
# before the first comma is dropped ("Ave." -> "Ave", but also "Broadway" ->
# "Broadwa"). It is kept unchanged because the map section compares Address with
# the literal "Multiple location", presumably a truncated "Multiple locations"
# (confirm); fix the pattern and that comparison together.
addy_split = re.compile(r".,")
price_pattern = re.compile(r"\$\d+\.(\d+)?")

# total= lets tqdm show progress against the number of rows.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    req = Request(row["URL"], headers={"user-agent": ua.ie})
    # Close each HTTP response as soon as its body has been read.
    with urlopen(req, context=context) as response:
        html = response.read()
    soup_tmp = BeautifulSoup(html, "lxml")

    # Text before the first split point is "\n<price> <street address>".
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = addy_split.split(gettings)[0]
    tmp = price_pattern.search(price_tmp).group()

    price.append(tmp)
    # Skip the leading "\n", the price and the following space.
    address.append(price_tmp[len(tmp) + 2:])

df["Price"] = price
df["Address"] = address
# Drop the URL column and index the table by rank.
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()
Cafe Menu Price Address
Rank
1 Old Oak Tap BLT $10. 2109 W. Chicago Ave
2 Au Cheval Fried Bologna $9. 800 W. Randolph St
3 Xoco Woodland Mushroom $9.50 445 N. Clark St
4 Al’s Deli Roast Beef $9.40 914 Noyes St
5 Publican Quality Meats PB&L $10. 825 W. Fulton Mkt

지도 시각화

requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

# The API key is left blank here; supply your own Google Maps key before running.
gmaps_key = ""
gmaps = googlemaps.Client(key=gmaps_key)

lat = []
lng = []

# Exactly one lat/lng entry is appended per row so the lists line up with df.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Rows without a single street address cannot be geocoded.
    if row["Address"] == "Multiple location":
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    target_name = row["Address"] + ", " + "Chicago"
    gmaps_output = gmaps.geocode(target_name)

    # An empty result would make gmaps_output[0] raise IndexError; record the
    # row as missing instead of aborting the whole run.
    if not gmaps_output:
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    location_output = gmaps_output[0].get("geometry")
    lat.append(location_output["location"]["lat"])
    lng.append(location_output["location"]["lng"])

df.tail()
Cafe Menu Price Address
Rank
46 Chickpea Kufta $8. 2018 W. Chicago Ave
47 The Goddess and Grocer Debbie’s Egg Salad $6.50 25 E. Delaware Pl
48 Zenwich Beef Curry $7.50 416 N. York St
49 Toni Patisserie Le Végétarien $8.75 65 E. Washington St
50 Phoebe’s Bakery The Gatsby $6.85 3351 N. Broadwa

# Attach the geocoded coordinates; rows that were not geocoded hold NaN.
df["lat"] = lat
df["lng"] = lng
df.tail()
Cafe Menu Price Address lat lng
Rank
46 Chickpea Kufta $8. 2018 W. Chicago Ave 41.896113 -87.677857
47 The Goddess and Grocer Debbie’s Egg Salad $6.50 25 E. Delaware Pl 41.898979 -87.627393
48 Zenwich Beef Curry $7.50 416 N. York St 41.910583 -87.940488
49 Toni Patisserie Le Végétarien $8.75 65 E. Washington St 41.883106 -87.625438
50 Phoebe’s Bakery The Gatsby $6.85 3351 N. Broadwa 41.943163 -87.644507

# Base map centred on Chicago.
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    # Skip rows that cannot be plotted: no single address, or no coordinates
    # (a marker cannot be placed at a NaN location).
    if (
        row["Address"] == "Multiple location"
        or pd.isna(row["lat"])
        or pd.isna(row["lng"])
    ):
        continue

    # One marker per cafe: the popup shows the cafe, the tooltip its sandwich.
    folium.Marker(
        location=[row["lat"], row["lng"]],
        popup=row["Cafe"],
        tooltip=row["Menu"],
        icon=folium.Icon(
            icon="coffee",
            prefix="fa"
        )
    ).add_to(mapping)

mapping