A first practice exercise in web scraping, using this page:
https://content.codecademy.com/courses/beautifulsoup/cacao/index.html
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
# Download the cacao ratings page. The timeout keeps the script from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP error
# page from being parsed as if it were the ratings table.
chocolate = requests.get(
    "https://content.codecademy.com/courses/beautifulsoup/cacao/index.html",
    timeout=30,
)
chocolate.raise_for_status()
soup = BeautifulSoup(chocolate.content, "html.parser")

# Collect the text of every element with class "Company". The first match is
# skipped -- presumably the table's header cell (TODO confirm against the page).
company_tags = soup.find_all(attrs={"class": "Company"})
companies = [company.get_text() for company in company_tags[1:]]
print(companies[:10])
['A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin', 'A. Morin']
# Collect every element with class "Rating" and convert its text to a float.
# The first match is skipped -- presumably the header cell, whose text would
# not parse as a number (TODO confirm against the page).
rating_tags = soup.find_all(attrs={"class": "Rating"})
ratings = [float(rating.get_text()) for rating in rating_tags[1:]]
print(ratings[:10])
[3.75, 2.75, 3.0, 3.5, 3.5, 2.75, 3.5, 3.5, 3.75, 4.0]
# Pair each scraped company name with its rating, one row per scraped entry.
df = pd.DataFrame({'Company': companies, 'Rating': ratings})

# Average the ratings per company, then turn the grouped Series back into a
# two-column frame so 'Company' is a regular column again.
mean_by_company = df.groupby("Company")["Rating"].mean()
df = mean_by_company.reset_index()

# Keep the ten companies with the highest mean rating.
top_10 = df.nlargest(10, 'Rating')
top_10
| | Company | Rating |
|---|---|---|
380 | Tobago Estate (Pralus) | 4.000000 |
183 | Heirloom Cacao Preservation (Zokoko) | 3.875000 |
287 | Ocelot | 3.875000 |
14 | Amedei | 3.846154 |
249 | Matale | 3.812500 |
304 | Patric | 3.791667 |
191 | Idilio (Felchlin) | 3.775000 |
2 | Acalli | 3.750000 |
82 | Chocola'te | 3.750000 |
96 | Christopher Morel (Felchlin) | 3.750000 |
Plot the ten highest-rated companies (but first truncate each company name to at most 20 characters).
# Shorten long company names so the rotated x-axis labels stay readable.
# Slicing with [:20] already leaves names of 20 characters or fewer untouched,
# so no explicit length check is needed. assign() builds a new frame rather
# than writing a column into the result of nlargest(), which may trigger a
# SettingWithCopyWarning on some pandas versions.
top_10 = top_10.assign(Company=top_10['Company'].str[:20])

# Bar chart of the mean rating for each of the top ten companies.
plt.figure(figsize=(10, 6))
plt.bar(top_10['Company'], top_10['Rating'])
plt.xticks(rotation=45)
plt.ylabel('Mean Rating')
plt.title('Companies Producing the Most Highly Rated Chocolate')
plt.show()
# Collect the text of every element with class "BroadBeanOrigin". The first
# match is skipped -- presumably the table's header cell (TODO confirm against
# the page).
bean_tags = soup.find_all(attrs={"class": "BroadBeanOrigin"})
BeanOrigin = [bean.get_text() for bean in bean_tags[1:]]
print(BeanOrigin[:10])
['Sao Tome', 'Togo', 'Togo', 'Togo', 'Peru', 'Venezuela', 'Cuba', 'Venezuela', 'Venezuela', 'Peru']
# Pair each scraped bean origin with the rating scraped at the same position.
origin_columns = {
    'Bean Origin': BeanOrigin,
    'Rating': ratings,
}
df1 = pd.DataFrame(origin_columns)
df1.head()
| | Bean Origin | Rating |
|---|---|---|
0 | Sao Tome | 3.75 |
1 | Togo | 2.75 |
2 | Togo | 3.00 |
3 | Togo | 3.50 |
4 | Peru | 3.50 |
# Average the ratings per bean origin and restore 'Bean Origin' as a column.
mean_by_origin = df1.groupby("Bean Origin")["Rating"].mean()
df1 = mean_by_origin.reset_index()

# Keep the ten origins with the highest mean rating (this rebinds top_10).
top_10 = df1.nlargest(10, 'Rating')

# Bar chart of the mean rating for each of those origins.
origin_labels = top_10['Bean Origin']
origin_means = top_10['Rating']
plt.figure(figsize=(10, 6))
plt.bar(origin_labels, origin_means)
plt.xticks(rotation=45)
plt.ylabel('Mean Rating')
plt.title('Regions Producing the Most Highly Rated Chocolate')
plt.show()