New York City Airbnb Open Data

In this project, I will explore the NYC Airbnb listings and aim to find out listing that match with upcoming trip

In [0]:
#Import packages and libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install geopandas
import geopandas as gpd
Requirement already satisfied: geopandas in /usr/local/lib/python3.6/dist-packages (0.6.1)
Requirement already satisfied: shapely in /usr/local/lib/python3.6/dist-packages (from geopandas) (1.6.4.post2)
Requirement already satisfied: pyproj in /usr/local/lib/python3.6/dist-packages (from geopandas) (2.4.0)
Requirement already satisfied: pandas>=0.23.0 in /usr/local/lib/python3.6/dist-packages (from geopandas) (0.25.3)
Requirement already satisfied: fiona in /usr/local/lib/python3.6/dist-packages (from geopandas) (1.8.9.post2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2018.9)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (2.6.1)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.23.0->geopandas) (1.17.3)
Requirement already satisfied: cligj>=0.5 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (0.5.0)
Requirement already satisfied: six>=1.7 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.12.0)
Requirement already satisfied: munch in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (2.5.0)
Requirement already satisfied: click<8,>=4.0 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (7.0)
Requirement already satisfied: click-plugins>=1.0 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (1.1.1)
Requirement already satisfied: attrs>=17 in /usr/local/lib/python3.6/dist-packages (from fiona->geopandas) (19.3.0)
In [0]:
#Load data

from google.colab import drive
drive.mount ('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [0]:
data = pd.read_csv('gdrive/My Drive/dataset/AB_NYC_2019.csv')

First Inspection of Data

In [0]:
data.head(5)
Out[0]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [0]:
data.dtypes, data.columns
Out[0]:
(id                                  int64
 name                               object
 host_id                             int64
 host_name                          object
 neighbourhood_group                object
 neighbourhood                      object
 latitude                          float64
 longitude                         float64
 room_type                          object
 price                               int64
 minimum_nights                      int64
 number_of_reviews                   int64
 last_review                        object
 reviews_per_month                 float64
 calculated_host_listings_count      int64
 availability_365                    int64
 dtype: object,
 Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
        'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
        'minimum_nights', 'number_of_reviews', 'last_review',
        'reviews_per_month', 'calculated_host_listings_count',
        'availability_365'],
       dtype='object'))
In [0]:
#Change name of 'neighbourhood_group' to 'boroname'

data.rename(columns={'neighbourhood_group':'boroname'}, inplace=True)
In [0]:
#Find out the listings by borough name
plt.figure(figsize=(10,10))
sns.scatterplot(x='longitude', y='latitude', hue='boroname', s=30, data=data)
Out[0]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f65c4dee588>

Listing in different Boroughs in NYC

In [0]:
# Count listing by borough

borough_count = data.groupby('boroname').agg('count').reset_index()
borough_count
Out[0]:
boroname id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 Bronx 1091 1090 1091 1090 1091 1091 1091 1091 1091 1091 1091 876 876 1091 1091
1 Brooklyn 20104 20098 20104 20095 20104 20104 20104 20104 20104 20104 20104 16447 16447 20104 20104
2 Manhattan 21661 21652 21661 21652 21661 21661 21661 21661 21661 21661 21661 16632 16632 21661 21661
3 Queens 5666 5666 5666 5664 5666 5666 5666 5666 5666 5666 5666 4574 4574 5666 5666
4 Staten Island 373 373 373 373 373 373 373 373 373 373 373 314 314 373 373
In [0]:
#Plot the count by borough
fig, ax1 = plt.subplots(1,1, figsize=(6,6))
sns.barplot(x='boroname', y='id', data=borough_count, ax=ax1, palette="plasma" )


ax1.set_title('Number of Listings by Borough', fontsize=15)
ax1.set_xlabel('Borough', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.tick_params(axis='both', labelsize=10)

Manhattan have more listings that other regions

In [0]:
#Using geopandas to bring in a base layer of NYC boroughs

nyc = gpd.read_file(gpd.datasets.get_path('nybb'))
nyc.head(5)
Out[0]:
BoroCode BoroName Shape_Leng Shape_Area geometry
0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON (((970217.022 145643.332, 970227....
1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON (((1029606.077 156073.814, 102957...
2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON (((1021176.479 151374.797, 102100...
3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON (((981219.056 188655.316, 980940....
4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON (((1012821.806 229228.265, 101278...
In [0]:
#Rename the 'BoroName' to 'boroname', this help to join data for the next step
nyc.rename(columns={'BoroName':'boroname'}, inplace=True)
bc_geo = nyc.merge(borough_count,on='boroname')

bc_geo
Out[0]:
BoroCode boroname Shape_Leng Shape_Area geometry id name host_id host_name neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 5 Staten Island 330470.010332 1.623820e+09 MULTIPOLYGON (((970217.022 145643.332, 970227.... 373 373 373 373 373 373 373 373 373 373 373 314 314 373 373
1 4 Queens 896344.047763 3.045213e+09 MULTIPOLYGON (((1029606.077 156073.814, 102957... 5666 5666 5666 5664 5666 5666 5666 5666 5666 5666 5666 4574 4574 5666 5666
2 3 Brooklyn 741080.523166 1.937479e+09 MULTIPOLYGON (((1021176.479 151374.797, 102100... 20104 20098 20104 20095 20104 20104 20104 20104 20104 20104 20104 16447 16447 20104 20104
3 1 Manhattan 359299.096471 6.364715e+08 MULTIPOLYGON (((981219.056 188655.316, 980940.... 21661 21652 21661 21652 21661 21661 21661 21661 21661 21661 21661 16632 16632 21661 21661
4 2 Bronx 464392.991824 1.186925e+09 MULTIPOLYGON (((1012821.806 229228.265, 101278... 1091 1090 1091 1090 1091 1091 1091 1091 1091 1091 1091 876 876 1091 1091
In [0]:
#Plot into the map count listing by borough

fig,ax = plt.subplots(1,1, figsize=(10,10))
bc_geo.plot(column='id', cmap='viridis_r', alpha=.5, ax=ax, legend=True)
bc_geo.apply(lambda x:ax.annotate(s=x.boroname, color='black', xy=x.geometry.centroid.coords[0],ha='center'), axis=1)
plt.title('Number of Airbnb Listings by NYC Borough')
plt.axis('off')
Out[0]:
(905464.7390380859, 1075092.8783935546, 112485.76063504723, 280480.4142594267)

We could see that most of listings locate in Brooklyn and Manhattan. In contract, Airbnb is not so popular in Bronx and Staten Island

In [0]:
#import shapely
from shapely import wkt

Since we don't have geometries of neighborhoods, so we could use CSV file from NYC Open Data Site. It has wkt in a geometry column

In [0]:
crs = {'init':'epsg:4326'}
geometry = gpd.points_from_xy(data.longitude, data.latitude)
geo_data = gpd.GeoDataFrame(data,crs=crs,geometry=geometry)
In [0]:
fig,ax = plt.subplots(figsize=(15,15))
nyc.plot(ax=ax,alpha=0.4,edgecolor='black')
geo_data.plot(column='id',ax=ax,legend=True,cmap='plasma',markersize=4)

plt.title("Number of Airbnb Listings")
plt.axis('off')
Out[0]:
(-74.28337047811617, -73.67222994890714, 40.4751445261289, 40.936503645041604)

For the upcoming trip, I would like to find 'Entire home/apt' option and prefer if it locate in Brooklyn. So let's plot a map to see locations of room type

In [0]:
fig,ax = plt.subplots(figsize=(15,15))
nyc.plot(ax=ax, alpha=0.4, edgecolor='black')
geo_data.plot(column='room_type',ax=ax,legend=True,cmap='plasma',markersize=4)
plt.title('Locations of room type')
plt.axis('off')
Out[0]:
(-74.28337047811617, -73.67222994890714, 40.4751445261289, 40.936503645041604)

Also find out different room type by borough

In [0]:
plt.figure(figsize=(10,10))
ax = sns.countplot(data['room_type'], hue=data['boroname'], palette='plasma')

Manhattan offer more entire home/apt than other regions. However, I am a student love travelling on budget. Therefore, I will focus on listings in Manhattan which have price under 65

In [0]:
data_manha_65=data[data.boroname=='Manhattan']
data_manha_65.head()
Out[0]:
id name host_id host_name boroname neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 geometry
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355 POINT (-73.98377 40.75362)
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365 POINT (-73.94190 40.80902)
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0 POINT (-73.94399 40.79851)
5 5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200 3 74 2019-06-22 0.59 1 129 POINT (-73.97500 40.74767)
7 5178 Large Furnished Room Near B'way 8967 Shunichi Manhattan Hell's Kitchen 40.76489 -73.98493 Private room 79 2 430 2019-06-24 3.47 1 220 POINT (-73.98493 40.76489)
In [0]:
data_manha_65 = data_manha[data_manha.price <65]
data_manha_65['label']=data_manha_65.apply(lambda x: (x['name'],'price:'+str(x['price'])),axis=1)
data_manha_65.head()
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Out[0]:
id name host_id host_name boroname neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 geometry label
28 9518 SPACIOUS, LOVELY FURNISHED MANHATTAN BEDROOM 31374 Shon Manhattan Inwood 40.86482 -73.92106 Private room 44 3 108 2019-06-15 1.11 3 311 POINT (-73.92106 40.86482) (SPACIOUS, LOVELY FURNISHED MANHATTAN BEDROOM,...
30 9668 front room/double bed 32294 Ssameer Or Trip Manhattan Harlem 40.82245 -73.95104 Private room 50 3 242 2019-06-01 2.04 3 355 POINT (-73.95104 40.82245) (front room/double bed, price:50)
31 9704 Spacious 1 bedroom in luxe building 32045 Teri Manhattan Harlem 40.81305 -73.95466 Private room 52 2 88 2019-06-14 1.42 1 255 POINT (-73.95466 40.81305) (Spacious 1 bedroom in luxe building, price:52)
33 9783 back room/bunk beds 32294 Ssameer Or Trip Manhattan Harlem 40.82130 -73.95318 Private room 50 3 273 2019-07-01 2.37 3 359 POINT (-73.95318 40.82130) (back room/bunk beds, price:50)
39 12048 LowerEastSide apt share shortterm 1 7549 Ben Manhattan Lower East Side 40.71401 -73.98917 Shared room 40 1 214 2019-07-05 1.81 4 188 POINT (-73.98917 40.71401) (LowerEastSide apt share shortterm 1, price:40)

According to below map, you could not only see location of listings but also comments.

In [0]:
#install folium to create a map
!pip install git+https://github.com/python-visualization/folium
Collecting git+https://github.com/python-visualization/folium
  Cloning https://github.com/python-visualization/folium to /tmp/pip-req-build-5hmsb4gg
  Running command git clone -q https://github.com/python-visualization/folium /tmp/pip-req-build-5hmsb4gg
Requirement already satisfied: branca>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (0.3.1)
Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (2.10.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (1.17.3)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from folium==0.10.0+4.ga1e3686) (2.21.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from branca>=0.3.0->folium==0.10.0+4.ga1e3686) (1.12.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.9->folium==0.10.0+4.ga1e3686) (1.1.1)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (1.24.3)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (2019.9.11)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->folium==0.10.0+4.ga1e3686) (2.8)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... done
  Created wheel for folium: filename=folium-0.10.0+4.ga1e3686-py2.py3-none-any.whl size=91404 sha256=76c32d679a62b78113a4e0d16591e239b0a872f411c894898c2e5374d59f7c7f
  Stored in directory: /tmp/pip-ephem-wheel-cache-p30kn4ah/wheels/1e/e1/75/ecbc91fd5dd5d90befb0b533bf7492d38acffa033310731862
Successfully built folium
ERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.10.0+4.ga1e3686 which is incompatible.
Installing collected packages: folium
  Found existing installation: folium 0.8.3
    Uninstalling folium-0.8.3:
      Successfully uninstalled folium-0.8.3
Successfully installed folium-0.10.0+4.ga1e3686
In [0]:
import folium
from folium import plugins

Long=-73.92
Lat=40.86
manha_map=folium.Map([Lat,Long],zoom_start=12)
manha_rooms_map=plugins.MarkerCluster().add_to(manha_map)
for lat,lon,label in zip(data_manha_65.latitude,data_manha_65.longitude,data_manha_65.label):
    folium.Marker(location=[lat,lon],icon=None,popup=label).add_to(manha_rooms_map)
manha_map.add_child(manha_rooms_map)

manha_map = folium.Map([Lat,Long],zoom_start=12)


manha_map
Out[0]:
In [0]: