New York City Airbnb Open Data

In this project, I will explore the NYC Airbnb listings and aim to find out listing that match with upcoming trip

In [0]:
#Import packages and libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install geopandas
import geopandas as gpd
Requirement already satisfied: geopandas in /usr/local/lib/python3.6/dist-packages (0.6.1)
Requirement already satisfied: shapely in /usr/local/lib/python3.6/dist-packages (from geopandas) (1.6.4.post2)
Requirement already satisfied: pyproj in /usr/local/lib/python3.6/dist-packages (from geopandas) (2.4.0)
Requirement already satisfied: pandas>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from geopandas) (0.25.3)
Requirement already satisfied: fiona in /usr/local/lib/python3.6/dist-packages (from geopandas) (1.8.9.post2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2018.9)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2.6.1)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (1.17.3)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (0.5.0)
Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.12.0)
Requirement already satisfied: munch in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (2.5.0)
Requirement already satisfied: click<8,>=4.0 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (7.0)
Requirement already satisfied: click-plugins>=1.0 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.1.1)
Requirement already satisfied: attrs>=17 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (19.3.0)
In [0]:
#Load data

from google.colab import drive
drive.mount ('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [0]:
data = pd.read_csv('gdrive/My Drive/dataset/AB_NYC_2019.csv')

First Inspection of Data

In [0]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [0]:
data.dtypes, data.columns
(id                                  int64
 name                               object
 host_id                             int64
 host_name                          object
 neighbourhood_group                object
 neighbourhood                      object
 latitude                          float64
 longitude                         float64
 room_type                          object
 price                               int64
 minimum_nights                      int64
 number_of_reviews                   int64
 last_review                        object
 reviews_per_month                 float64
 calculated_host_listings_count      int64
 availability_365                    int64
 dtype: object,
 Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
        'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
        'minimum_nights', 'number_of_reviews', 'last_review',
        'reviews_per_month', 'calculated_host_listings_count',
In [0]:
#Change name of 'neighbourhood_group' to 'boroname'

data.rename(columns={'neighbourhood_group':'boroname'}, inplace=True)
In [0]:
#Find out the listings by borough name
sns.scatterplot(x='longitude', y='latitude', hue='boroname', s=30, data=data)
<matplotlib.axes._subplots.AxesSubplot at 0x7f65c4dee588>

Listing in different Boroughs in NYC

In [0]:
# Count listing by borough

borough_count = data.groupby('boroname').agg('count').reset_index()
boroname id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 Bronx 1091 1090 1091 1090 1091 1091 1091 1091 1091 1091 1091 876 876 1091 1091
1 Brooklyn 20104 20098 20104 20095 20104 20104 20104 20104 20104 20104 20104 16447 16447 20104 20104
2 Manhattan 21661 21652 21661 21652 21661 21661 21661 21661 21661 21661 21661 16632 16632 21661 21661
3 Queens 5666 5666 5666 5664 5666 5666 5666 5666 5666 5666 5666 4574 4574 5666 5666
4 Staten Island 373 373 373 373 373 373 373 373 373 373 373 314 314 373 373
In [0]:
#Plot the count by borough
fig, ax1 = plt.subplots(1,1, figsize=(6,6))
sns.barplot(x='boroname', y='id', data=borough_count, ax=ax1, palette="plasma" )

ax1.set_title('Number of Listings by Borough', fontsize=15)
ax1.set_xlabel('Borough', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.tick_params(axis='both', labelsize=10)

Manhattan have more listings that other regions

In [0]:
#Using geopandas to bring in a base layer of NYC boroughs

nyc = gpd.read_file(gpd.datasets.get_path('nybb'))
BoroCode BoroName Shape_Leng Shape_Area geometry
0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON (((970217.022 145643.332, 970227....
1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON (((1029606.077 156073.814, 102957...
2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON (((1021176.479 151374.797, 102100...
3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON (((981219.056 188655.316, 980940....
4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON (((1012821.806 229228.265, 101278...
In [0]:
#Rename the 'BoroName' to 'boroname', this help to join data for the next step
nyc.rename(columns={'BoroName':'boroname'}, inplace=True)
bc_geo = nyc.merge(borough_count,on='boroname')

BoroCode boroname Shape_Leng Shape_Area geometry id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON (((970217.022 145643.332, 970227.... 373 373 373 373 373 373 373 373 373 373 373 314 314 373 373
1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON (((1029606.077 156073.814, 102957... 5666 5666 5666 5664 5666 5666 5666 5666 5666 5666 5666 4574 4574 5666 5666
2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON (((1021176.479 151374.797, 102100... 20104 20098 20104 20095 20104 20104 20104 20104 20104 20104 20104 16447 16447 20104 20104
3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON (((981219.056 188655.316, 980940.... 21661 21652 21661 21652 21661 21661 21661 21661 21661 21661 21661 16632 16632 21661 21661
4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON (((1012821.806 229228.265, 101278... 1091 1090 1091 1090 1091 1091 1091 1091 1091 1091 1091 876 876 1091 1091
In [0]:
#Plot into the map count listing by borough

fig,ax = plt.subplots(1,1, figsize=(10,10))
bc_geo.plot(column='id', cmap='viridis_r', alpha=.5, ax=ax, legend=True)
bc_geo.apply(lambda x:ax.annotate(s=x.boroname, color='black', xy=x.geometry.centroid.coords[0],ha='center'), axis=1)
plt.title('Number of Airbnb Listings by NYC Borough')
(905464.7390380859, 1075092.8783935546, 112485.76063504723, 280480.4142594267)

We could see that most of listings locate in Brooklyn and Manhattan. In contract, Airbnb is not so popular in Bronx and Staten Island

In [0]:
#import shapely
from shapely import wkt

Since we don't have geometries of neighborhoods, so we could use CSV file from NYC Open Data Site. It has wkt in a geometry column

In [0]:
crs = {'init':'epsg:4326'}
geometry = gpd.points_from_xy(data.longitude, data.latitude)
geo_data = gpd.GeoDataFrame(data,crs=crs,geometry=geometry)
In [0]:
fig,ax = plt.subplots(figsize=(15,15))

plt.title("Number of Airbnb Listings")
(-74.28337047811617, -73.67222994890714, 40.4751445261289, 40.936503645041604)

For the upcoming trip, I would like to find 'Entire home/apt' option and prefer if it locate in Brooklyn. So let's plot a map to see locations of room type

In [0]:
fig,ax = plt.subplots(figsize=(15,15))
nyc.plot(ax=ax, alpha=0.4, edgecolor='black')
plt.title('Locations of room type')
(-74.28337047811617, -73.67222994890714, 40.4751445261289, 40.936503645041604)

Also find out different room type by borough

In [0]:
ax = sns.countplot(data['room_type'], hue=data['boroname'], palette='plasma')

Manhattan offer more entire home/apt than other regions. However, I am a student love travelling on budget. Therefore, I will focus on listings in Manhattan which have price under 65

In [0]:
id name host_id host_name boroname neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 geometry
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355 POINT (-73.98377 40.75362)
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365 POINT (-73.94190 40.80902)
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0 POINT (-73.94399 40.79851)
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200 3 74 2019-06-22 0.59 1 129 POINT (-73.97500 40.74767)
7 5178 Large Furnished Room Near B'way 8967 Shunichi Manhattan Hell's Kitchen 40.76489 -73.98493 Private room 79 2 430 2019-06-24 3.47 1 220 POINT (-73.98493 40.76489)
In [0]:
data_manha_65 = data_manha[data_manha.price <65]
data_manha_65['label']=data_manha_65.apply(lambda x: (x['name'],'price:'+str(x['price'])),axis=1)
/usr/local/lib/python3.6/dist-packages/ SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation:
id name host_id host_name boroname neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 geometry label
28 9518 SPACIOUS, LOVELY FURNISHED MANHATTAN BEDROOM 31374 Shon Manhattan Inwood 40.86482 -73.92106 Private room 44 3 108 2019-06-15 1.11 3 311 POINT (-73.92106 40.86482) (SPACIOUS, LOVELY FURNISHED MANHATTAN BEDROOM,...
30 9668 front room/double bed 32294 Ssameer Or Trip Manhattan Harlem 40.82245 -73.95104 Private room 50 3 242 2019-06-01 2.04 3 355 POINT (-73.95104 40.82245) (front room/double bed, price:50)
31 9704 Spacious 1 bedroom in luxe building 32045 Teri Manhattan Harlem 40.81305 -73.95466 Private room 52 2 88 2019-06-14 1.42 1 255 POINT (-73.95466 40.81305) (Spacious 1 bedroom in luxe building, price:52)
33 9783 back room/bunk beds 32294 Ssameer Or Trip Manhattan Harlem 40.82130 -73.95318 Private room 50 3 273 2019-07-01 2.37 3 359 POINT (-73.95318 40.82130) (back room/bunk beds, price:50)
39 12048 LowerEastSide apt share shortterm 1 7549 Ben Manhattan Lower East Side 40.71401 -73.98917 Shared room 40 1 214 2019-07-05 1.81 4 188 POINT (-73.98917 40.71401) (LowerEastSide apt share shortterm 1, price:40)

According to below map, you could not only see location of listings but also comments.

In [0]:
#install folium to create a map
!pip install git+
Collecting git+
  Cloning to /tmp/pip-req-build-5hmsb4gg
  Running command git clone -q /tmp/pip-req-build-5hmsb4gg
Requirement already satisfied: branca>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (0.3.1)
Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (2.10.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (1.17.3)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (2.21.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from branca>=0.3.0->folium==0.10.0+4.ga1e3686) (1.12.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.9->folium==0.10.0+4.ga1e3686) (1.1.1)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (1.24.3)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (2019.9.11)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (2.8)
Building wheels for collected packages: folium
  Building wheel for folium ( ... done
  Created wheel for folium: filename=folium-0.10.0+4.ga1e3686-py2.py3-none-any.whl size=91404 sha256=76c32d679a62b78113a4e0d16591e239b0a872f411c894898c2e5374d59f7c7f
  Stored in directory: /tmp/pip-ephem-wheel-cache-p30kn4ah/wheels/1e/e1/75/ecbc91fd5dd5d90befb0b533bf7492d38acffa033310731862
Successfully built folium
ERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.10.0+4.ga1e3686 which is incompatible.
Installing collected packages: folium
  Found existing installation: folium 0.8.3
    Uninstalling folium-0.8.3:
      Successfully uninstalled folium-0.8.3
Successfully installed folium-0.10.0+4.ga1e3686
In [0]:
import folium
from folium import plugins

for lat,lon,label in zip(data_manha_65.latitude,data_manha_65.longitude,data_manha_65.label):

manha_map = folium.Map([Lat,Long],zoom_start=12)

In [0]: