Project 4 - 시카고 맛집 데이터 분석

August 25, 2024 4 분 소요

시카고 맛집 데이터 분석

1. 개요

https://www.chicagomag.com/chicago-magazine/november-2012/best-sandwiches-chicago/
Chicago Magazine – The 50 Best Sandwiches

목표

50개 페이지에서 각 가게의 정보 가져오기

가게이름
대표메뉴
대표메뉴의 가격
가게주소

2. 메인페이지

from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import ssl

# NOTE(review): this context disables TLS certificate verification and relies
# on a private ssl helper. Kept so the tutorial behaves as before; prefer
# ssl.create_default_context() whenever the site's certificate validates.
context = ssl._create_unverified_context()

url_base = "https://www.chicagomag.com/"
url_sub = "chicago-magazine/november-2012/best-sandwiches-chicago/"
url = url_base + url_sub

# Send a browser-like User-Agent header with the request.
ua = UserAgent()
req = Request(url, headers={"User-Agent": ua.ie})
# Read the body inside a context manager so the HTTP response is closed.
with urlopen(req, context=context) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

=>

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <script src="https://cmp.osano.com/16A1AnRt2Fn8i1unj/f15ebf08-7008-40fe-
...
  </script>
 </body>
</html>

# Each ranking entry is a <div class="sammy">; show them together with their count.
sammy_divs = soup.find_all("div", class_="sammy")
sammy_divs, len(sammy_divs)
# CSS-selector equivalent: soup.select(".sammy"), len(soup.select(".sammy"))

=>

([<div class="sammy" style="position: relative;">
  <div class="sammyRank">1</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
  Old Oak Tap<br/>
  <em>Read more</em> </a></div>
  </div>,
  <div class="sammy" style="position: relative;">
  <div class="sammyRank">2</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-
...
  <div class="sammyListing"><a href="https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Phoebes-Bakery-The-Gatsby/"><b>The Gatsby</b><br/>
  Phoebe’s Bakery<br/>
  <em>Read more</em> </a></div>
  </div>],
 50)

# Take the first ranking entry as a sample for exploring the tag structure.
tmp_one = soup.find_all("div", "sammy")[0]
type(tmp_one)

=>

bs4.element.Tag

# Rank text of the sample entry.
tmp_one.find(class_="sammyRank").get_text()
# CSS-selector equivalent: tmp_one.select_one(".sammyRank").text

=>

'1'

# Listing text of the sample entry: menu, cafe name and the "Read more" link text.
tmp_one.find("div", {"class":"sammyListing"}).text
# CSS-selector equivalent: tmp_one.select_one(".sammyListing").get_text()

=>

'BLT\nOld Oak Tap\nRead more '

# Link to the entry's detail page (may be site-relative, as in the output below).
tmp_one.find("a")["href"]
# CSS-selector equivalent: tmp_one.select_one("a").get("href")

=>

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

import re

# Split the listing text on line breaks (LF or CRLF) to separate menu and cafe.
tmp_string = tmp_one.find(class_="sammyListing").get_text()
re.split(("\n|\r\n"), tmp_string)

=>

['BLT', 'Old Oak Tap', 'Read more ']

# Split once and print the first two pieces: the menu, then the cafe name.
parts = re.split("\n|\r\n", tmp_string)
print(parts[0])  # menu
print(parts[1])  # cafe

=>

BLT
Old Oak Tap

DataFrame

from urllib.parse import urljoin

# Use the same HTTPS origin the main page was fetched from, so site-relative
# hrefs resolve to the same scheme as the absolute hrefs already on the page.
url_base = "https://www.chicagomag.com"

# Empty lists that collect the scraped values.
# Each list becomes one column; they are combined into a DataFrame afterwards.
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")  # equivalent: soup.select(".sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())
    # The listing text reads "<menu>\n<cafe>\nRead more "; split it once and
    # reuse the pieces instead of running the regex twice per entry.
    listing_lines = re.split("\n|\r\n", item.find(class_="sammyListing").get_text())
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])
    # urljoin prefixes site-relative hrefs and leaves absolute ones untouched.
    url_add.append(urljoin(url_base, item.find("a")["href"]))
    
import pandas as pd

# Gather the scraped columns under their headers; `data` is reused below
# when the frame is rebuilt with a different column order.
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)

df = pd.DataFrame(data)
df.tail(2)

	Rank	Menu	Cafe	URL
48	49	Le Végétarien	Toni Patisserie	https://www.chicagomag.com/Chicago-Magazine/No...
49	50	The Gatsby	Phoebe’s Bakery	https://www.chicagomag.com/Chicago-Magazine/No...

컬럼 순서 변경

# Rebuild the frame with an explicit column order (Cafe before Menu).
df = pd.DataFrame(data, columns=["Rank", "Cafe", "Menu", "URL"])
df.tail()

	Rank	Cafe	Menu	URL
45	46	Chickpea	Kufta	https://www.chicagomag.com/Chicago-Magazine/No...
46	47	The Goddess and Grocer	Debbie’s Egg Salad	https://www.chicagomag.com/Chicago-Magazine/No...
47	48	Zenwich	Beef Curry	https://www.chicagomag.com/Chicago-Magazine/No...
48	49	Toni Patisserie	Le Végétarien	https://www.chicagomag.com/Chicago-Magazine/No...
49	50	Phoebe’s Bakery	The Gatsby	https://www.chicagomag.com/Chicago-Magazine/No...

데이터 저장

# Save the ranking table as a UTF-8, comma-separated CSV; it is read back
# in the sub-page section.
df.to_csv(
    "../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8"
)

3. 하위페이지

requirements

import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

# Reload the list saved from the main page; the first CSV column is the row index.
df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", index_col=0)
df.tail()

	Rank	Cafe	Menu	URL
45	46	Chickpea	Kufta	https://www.chicagomag.com/Chicago-Magazine/No...
46	47	The Goddess and Grocer	Debbie’s Egg Salad	https://www.chicagomag.com/Chicago-Magazine/No...
47	48	Zenwich	Beef Curry	https://www.chicagomag.com/Chicago-Magazine/No...
48	49	Toni Patisserie	Le Végétarien	https://www.chicagomag.com/Chicago-Magazine/No...
49	50	Phoebe’s Bakery	The Gatsby	https://www.chicagomag.com/Chicago-Magazine/No...

# Detail-page URL of the first row, used to prototype the sub-page parsing.
df["URL"][0]

=>

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

# Fetch the first detail page. `ua` and `context` are the objects created in
# the main-page section above.
req = Request(df["URL"][0], headers={"user-agent":ua.ie})
# Read the body inside a context manager so the HTTP response is closed.
with urlopen(req, context=context) as response:
    html = response.read()
soup_tmp = BeautifulSoup(html, "html.parser")
soup_tmp.find("p", "addy")  # equivalent: soup_tmp.select_one(".addy")

=>

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

regular expression

# Plain text of the <p class="addy"> tag: price, street address, phone and website.
price_tmp = soup_tmp.find("p", "addy").text
price_tmp

=>

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

import re
re.split(".,", price_tmp)

=>

['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']

# Keep only the text before the first comma: "<price>. <street address>".
# r"\.?," drops an abbreviation period before the comma but, unlike ".,",
# never removes a letter or digit.
price_tmp = re.split(r"\.?,", price_tmp)[0]
price_tmp

=>

'\n$10. 2109 W. Chicago Ave'

# Raw string: "\$" and "\d" are not valid Python string escapes. The
# fractional part is optional, so the sentence period after a whole-dollar
# price is not captured as part of the price.
price_match = re.search(r"\$\d+(?:\.\d+)?", price_tmp)
tmp = price_match.group()
# The address is whatever follows the price, minus the separating ". ".
price_tmp[price_match.end():].lstrip(". ")

=>

'2109 W. Chicago Ave'

가격, 주소 추가

from tqdm import tqdm

price = []
address = []

for idx, row in tqdm(df.iterrows()):
    req = Request(row["URL"], headers={"user-agent":ua.ie})
    html = urlopen(req, context=context).read()
    soup_tmp = BeautifulSoup(html, "lxml")
    
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()

    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])

df["Price"] = price
df["Address"] = address
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()

	Cafe	Menu	Price	Address
Rank
1	Old Oak Tap	BLT	$10.	2109 W. Chicago Ave
2	Au Cheval	Fried Bologna	$9.	800 W. Randolph St
3	Xoco	Woodland Mushroom	$9.50	445 N. Clark St
4	Al’s Deli	Roast Beef	$9.40	914 Noyes St
5	Publican Quality Meats	PB&L	$10.	825 W. Fulton Mkt

지도 시각화

requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

gmaps_key = ""
gmaps = googlemaps.Client(key=gmaps_key)

lat = []
lng = []

for idx, row in tqdm(df.iterrows()):
    if not row["Address"] == "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        # print(target_name)
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])
        # location_output = gmaps_output[0]
    else:
        lat.append(np.nan)
        lng.append(np.nan)
        
df.tail()

	Cafe	Menu	Price	Address
Rank
46	Chickpea	Kufta	$8.	2018 W. Chicago Ave
47	The Goddess and Grocer	Debbie’s Egg Salad	$6.50	25 E. Delaware Pl
48	Zenwich	Beef Curry	$7.50	416 N. York St
49	Toni Patisserie	Le Végétarien	$8.75	65 E. Washington St
50	Phoebe’s Bakery	The Gatsby	$6.85	3351 N. Broadwa

df["lat"] = lat
df["lng"] = lng
df.tail()

	Cafe	Menu	Price	Address	lat	lng
Rank
46	Chickpea	Kufta	$8.	2018 W. Chicago Ave	41.896113	-87.677857
47	The Goddess and Grocer	Debbie’s Egg Salad	$6.50	25 E. Delaware Pl	41.898979	-87.627393
48	Zenwich	Beef Curry	$7.50	416 N. York St	41.910583	-87.940488
49	Toni Patisserie	Le Végétarien	$8.75	65 E. Washington St	41.883106	-87.625438
50	Phoebe’s Bakery	The Gatsby	$6.85	3351 N. Broadwa	41.943163	-87.644507

mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    if not row["Address"] == "Multiple location":
        folium.Marker(
            location=[row["lat"], row["lng"]],
            popup=row["Cafe"],
            tooltip=row["Menu"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa"
            )
        ).add_to(mapping)

mapping

Twitter Facebook LinkedIn

Project 4 - 시카고 맛집 데이터 분석

시카고 맛집 데이터 분석

1. 개요

목표

2. 메인페이지

DataFrame

컬럼 순서 변경

데이터 저장

3. 하위페이지

requirements

regular expression

가격, 주소 추가

지도 시각화

requirements

공유하기

참고

Project 3 - Web Parsing

Project 8 - 인구 소멸 위기 지역 데이터 분석

Project 5 - 네이버 영화 평점 사이트 분석

Project 7 - Naver API에서 모은 몰스킨 데이터 분석