bigquery-public-data:new_york_taxi_trips
# !pip3 install pandas-gbq
project_id = 'your_project_id'
import pandas as pd
import numpy as np
query = """
SELECT
    DATETIME_TRUNC(pickup_datetime, hour) AS pickup_hour,
    COUNT(*) AS cnt
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`
WHERE EXTRACT(MONTH FROM pickup_datetime) = 1
GROUP BY pickup_hour
ORDER BY pickup_hour
"""
df = pd.read_gbq(query=query, dialect='standard',
                 project_id=project_id,
                 reauth=True,
                 auth_local_webserver=True)
df.tail()
|     | pickup_hour         | cnt   |
|-----|---------------------|-------|
| 739 | 2015-01-31 19:00:00 | 32436 |
| 740 | 2015-01-31 20:00:00 | 27555 |
| 741 | 2015-01-31 21:00:00 | 27477 |
| 742 | 2015-01-31 22:00:00 | 29862 |
| 743 | 2015-01-31 23:00:00 | 29856 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 2 columns):
pickup_hour    744 non-null datetime64[ns]
cnt            744 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 11.7 KB
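Each read_gbq call re-runs the query in BigQuery (and bills another scan), so while iterating it can be worth caching the result locally. A minimal sketch, assuming a writable working directory; the file name nyc_taxi_201501.csv is just a hypothetical example:

import os
cache_path = 'nyc_taxi_201501.csv'  # hypothetical local cache file
if os.path.exists(cache_path):
    df = pd.read_csv(cache_path, parse_dates=['pickup_hour'])
else:
    df = pd.read_gbq(query=query, dialect='standard', project_id=project_id)
    df.to_csv(cache_path, index=False)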
def reduce_mem_usage(df):
    """
    Iterate over the columns of a dataframe and downcast each
    numeric column to the smallest dtype that holds its values.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f}MB')
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
        elif str(col_type).startswith('int'):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            # columns needing the full int64 range stay int64
        elif str(col_type).startswith('float'):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            # columns needing the full float64 range stay float64
        # other dtypes (e.g. datetime64) are left as-is
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f}MB')
    print(f'Decreased by {100*((start_mem - end_mem)/start_mem):.1f}%')
    return df
new_df = reduce_mem_usage(df)
Memory usage of dataframe is 0.01MB
Memory usage after optimization is: 0.01MB
Decreased by 24.8%
new_df.info()
# 11.7 KB -> 8.8 KB. The dataset here is small, so the savings are modest; with more data the effect is much larger
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 2 columns):
pickup_hour    744 non-null datetime64[ns]
cnt            744 non-null int32
dtypes: datetime64[ns](1), int32(1)
memory usage: 8.8 KB
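One caveat before applying this to real feature columns: float16 keeps only ~11 bits of significand, so integers above 2048 stop being exactly representable and downcast floats can silently lose precision. A quick sanity check with plain numpy:

# float16 represents integers exactly only up to 2**11 = 2048
print(np.float16(2049))   # 2048.0 - already rounded
print(np.float16(29862))  # 29856.0 - an hourly count like ours would be distorted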
# The counts here were already aggregated in BigQuery,
# but value_counts() is handy whenever they aren't
df['col_name'].value_counts(ascending=True)
new_df['pickup_hour'].value_counts().head()
2015-01-22 23:00:00    1
2015-01-25 08:00:00    1
2015-01-28 02:00:00    1
2015-01-17 07:00:00    1
2015-01-06 12:00:00    1
Name: pickup_hour, dtype: int64
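value_counts can also return shares instead of raw counts via normalize=True; using the same placeholder column name as above:

# Proportions instead of counts
df['col_name'].value_counts(normalize=True, ascending=True)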
new_df['pickup_hour'].dt
<pandas.core.indexes.accessors.DatetimeProperties object at 0x11c9814a8>
new_df['year'] = new_df['pickup_hour'].dt.year
new_df['month'] = new_df['pickup_hour'].dt.month
new_df['day'] = new_df['pickup_hour'].dt.day
new_df['hour'] = new_df['pickup_hour'].dt.hour
new_df['weekday'] = new_df['pickup_hour'].dt.weekday
%%timeit
new_df[new_df['month'] == 1][new_df['day'] == 10][new_df['hour']==10]
/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
2.35 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
new_df[(new_df['month'] == 1) & (new_df['day']==10) & (new_df['hour']==10)]
1.67 ms ± 28.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
new_df.loc[new_df['day'].isin([10, 11])].head()
|     | pickup_hour         | cnt   | year | month | day | hour | weekday |
|-----|---------------------|-------|------|-------|-----|------|---------|
| 216 | 2015-01-10 00:00:00 | 26586 | 2015 | 1     | 10  | 0    | 5       |
| 217 | 2015-01-10 01:00:00 | 21782 | 2015 | 1     | 10  | 1    | 5       |
| 218 | 2015-01-10 02:00:00 | 17535 | 2015 | 1     | 10  | 2    | 5       |
| 219 | 2015-01-10 03:00:00 | 12487 | 2015 | 1     | 10  | 3    | 5       |
| 220 | 2015-01-10 04:00:00 | 7673  | 2015 | 1     | 10  | 4    | 5       |
%%timeit
new_df.loc[(new_df['month']==1) & (new_df['day']== 10)].head()
1.35 ms ± 37.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%%timeit
new_df.query('month == 1 & day == 10').head()
2.16 ms ± 43.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
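Another option worth knowing: once pickup_hour is a DatetimeIndex (as done later in this notebook), pandas supports partial string indexing, so the month/day/hour helper columns aren't strictly needed just for filtering. A minimal sketch:

hourly = new_df.set_index('pickup_hour')
hourly.loc['2015-01-10']     # every hour on Jan 10
hourly.loc['2015-01-10 10']  # just the 10:00 bucket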
new_df.pivot_table('cnt', index='hour', columns='weekday', aggfunc='mean').head()
| hour \ weekday | 0       | 1       | 2       | 3       | 4       | 5       | 6        |
|----------------|---------|---------|---------|---------|---------|---------|----------|
| 0              | 8552.75 | 6719.25 | 8787.00 | 14326.4 | 14054.2 | 25487.2 | 26099.00 |
| 1              | 5360.50 | 3707.00 | 5283.25 | 11440.4 | 8785.6  | 21118.6 | 22754.75 |
| 2              | 3675.25 | 2361.75 | 3337.75 | 8753.6  | 5856.8  | 16382.0 | 18918.00 |
| 3              | 2730.00 | 1504.00 | 2307.00 | 7030.2  | 3948.6  | 12112.0 | 14226.50 |
| 4              | 2691.25 | 1633.75 | 2297.25 | 5444.8  | 3341.8  | 7730.4  | 8549.25  |
new_df.groupby(['hour', 'weekday'])['cnt'].mean().unstack().head()
| hour \ weekday | 0       | 1       | 2       | 3       | 4       | 5       | 6        |
|----------------|---------|---------|---------|---------|---------|---------|----------|
| 0              | 8552.75 | 6719.25 | 8787.00 | 14326.4 | 14054.2 | 25487.2 | 26099.00 |
| 1              | 5360.50 | 3707.00 | 5283.25 | 11440.4 | 8785.6  | 21118.6 | 22754.75 |
| 2              | 3675.25 | 2361.75 | 3337.75 | 8753.6  | 5856.8  | 16382.0 | 18918.00 |
| 3              | 2730.00 | 1504.00 | 2307.00 | 7030.2  | 3948.6  | 12112.0 | 14226.50 |
| 4              | 2691.25 | 1633.75 | 2297.25 | 5444.8  | 3341.8  | 7730.4  | 8549.25  |
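The two approaches produce identical frames; if you want to convince yourself, pandas ships an assertion helper for exactly this:

# Raises AssertionError if the two results differ
pd.testing.assert_frame_equal(
    new_df.pivot_table('cnt', index='hour', columns='weekday', aggfunc='mean'),
    new_df.groupby(['hour', 'weekday'])['cnt'].mean().unstack()
)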
pd.date_range(start='2018-01-01', end='2019-01-01', freq='1H')
DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00', '2018-01-01 02:00:00', '2018-01-01 03:00:00', '2018-01-01 04:00:00', '2018-01-01 05:00:00', '2018-01-01 06:00:00', '2018-01-01 07:00:00', '2018-01-01 08:00:00', '2018-01-01 09:00:00', ... '2018-12-31 15:00:00', '2018-12-31 16:00:00', '2018-12-31 17:00:00', '2018-12-31 18:00:00', '2018-12-31 19:00:00', '2018-12-31 20:00:00', '2018-12-31 21:00:00', '2018-12-31 22:00:00', '2018-12-31 23:00:00', '2019-01-01 00:00:00'], dtype='datetime64[ns]', length=8761, freq='H')
pd.date_range(start='2018-01-01', end='2019-01-01', freq='1D')
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10', ... '2018-12-23', '2018-12-24', '2018-12-25', '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-29', '2018-12-30', '2018-12-31', '2019-01-01'], dtype='datetime64[ns]', length=366, freq='D')
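date_range is especially handy for filling gaps in a time series: reindex against the complete hourly range and any missing hour becomes an explicit row. A minimal sketch for this January data (there happen to be no gaps here, so the shape stays at 744 rows):

full_range = pd.date_range(start='2015-01-01', end='2015-01-31 23:00:00', freq='1H')
filled = new_df.set_index('pickup_hour').reindex(full_range, fill_value=0)
filled.shape  # (744, ...) - one row per hour, missing hours filled with 0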
# This dataset has no string columns, so temporarily cast hour to str to demonstrate
new_df['string_hour'] = new_df['hour'].astype(str)
# Use | in the pattern to match multiple values (regex alternation)
new_df.loc[new_df['string_hour'].str.contains("10|11")].head()
|    | pickup_hour         | cnt   | year | month | day | hour | weekday | string_hour |
|----|---------------------|-------|------|-------|-----|------|---------|-------------|
| 10 | 2015-01-01 10:00:00 | 9949  | 2015 | 1     | 1   | 10   | 3       | 10          |
| 11 | 2015-01-01 11:00:00 | 13870 | 2015 | 1     | 1   | 11   | 3       | 11          |
| 34 | 2015-01-02 10:00:00 | 15243 | 2015 | 1     | 2   | 10   | 4       | 10          |
| 35 | 2015-01-02 11:00:00 | 16999 | 2015 | 1     | 2   | 11   | 4       | 11          |
| 58 | 2015-01-03 10:00:00 | 14975 | 2015 | 1     | 3   | 10   | 5       | 10          |
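Note that contains does substring matching, so on richer data the pattern "10|11" would also match values like "110". Anchoring the regex avoids that; the non-capturing group (?:...) sidesteps a pandas warning about match groups:

# Exact matches for "10" or "11" only
new_df.loc[new_df['string_hour'].str.contains(r'^(?:10|11)$')].head()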
new_df = new_df.set_index('pickup_hour')
new_df['moving_average'] = new_df['cnt'].rolling(window=3, center=False).mean()
new_df['lag_cnt'] = new_df['cnt'].shift(1)
new_df.head()
| pickup_hour         | cnt   | year | month | day | hour | weekday | string_hour | moving_average | lag_cnt |
|---------------------|-------|------|-------|-----|------|---------|-------------|----------------|---------|
| 2015-01-01 00:00:00 | 28312 | 2015 | 1     | 1   | 0    | 3       | 0           | NaN            | NaN     |
| 2015-01-01 01:00:00 | 31707 | 2015 | 1     | 1   | 1    | 3       | 1           | NaN            | 28312.0 |
| 2015-01-01 02:00:00 | 28068 | 2015 | 1     | 1   | 2    | 3       | 2           | 29362.333333   | 31707.0 |
| 2015-01-01 03:00:00 | 24288 | 2015 | 1     | 1   | 3    | 3       | 3           | 28021.000000   | 28068.0 |
| 2015-01-01 04:00:00 | 17081 | 2015 | 1     | 1   | 4    | 3       | 4           | 23145.666667   | 24288.0 |
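If you need several lags (typical when building forecasting features), a small loop over shift keeps it tidy. A sketch, assuming lags 1 through 3 are wanted:

# Generate lag_1 .. lag_3 features from the hourly counts
for lag in range(1, 4):
    new_df[f'lag_{lag}'] = new_df['cnt'].shift(lag)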
# Show up to 1000 rows when displaying a DataFrame
pd.set_option('display.max_rows', 1000)
# Render plots at retina resolution (crisper on a Mac)
%config InlineBackend.figure_format = 'retina'
# Allow multiple outputs to be printed from a single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# !pip3 install cufflinks==0.16
# !pip3 install plotly==3.10.0
# !pip3 install seaborn
import plotly.plotly as py
import cufflinks as cf
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
print(cf.__version__)
%config InlineBackend.figure_format = 'retina'
cf.go_offline()
0.16
%%time
query = """
SELECT
    DATETIME_TRUNC(pickup_datetime, hour) AS pickup_hour,
    COUNT(*) AS cnt
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`
WHERE EXTRACT(MONTH FROM pickup_datetime) = 1
GROUP BY pickup_hour
ORDER BY pickup_hour
"""
df = pd.read_gbq(query=query, dialect='standard', project_id=project_id)
CPU times: user 54.8 ms, sys: 13.9 ms, total: 68.7 ms Wall time: 4.64 s
To plot several lines in one chart, the data first has to be reshaped into wide form:

| index | column_a | column_b | column_c |
|-------|----------|----------|----------|
| 1     | 1000     | 1500     | 1200     |
| 2     | 1400     | 1000     | 1200     |
| 3     | 1800     | 800      | 1200     |
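One way to get long-form data into that wide shape is pivot (or pivot_table when duplicate index/column pairs need aggregating). A sketch with hypothetical names matching the table above:

# long -> wide: one column per series, shared index on the x-axis
# (long_df, 'series' and 'value' are hypothetical names for illustration)
wide = long_df.pivot(index='index', columns='series', values='value')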
# pickup_hour is already a datetime, so the to_datetime call below isn't needed here,
# but it's kept for anyone whose data arrives as strings
# df['pickup_hour'] = pd.to_datetime(df['pickup_hour'])
df = df.set_index('pickup_hour')
df.head()
| pickup_hour         | cnt   |
|---------------------|-------|
| 2015-01-01 00:00:00 | 28312 |
| 2015-01-01 01:00:00 | 31707 |
| 2015-01-01 02:00:00 | 28068 |
| 2015-01-01 03:00:00 | 24288 |
| 2015-01-01 04:00:00 | 17081 |
layout1 = cf.Layout(
height=500,
width=800
)
df.iplot(kind='scatter',xTitle='Datetimes',yTitle='Demand',title='NYC Taxi Demand(2015-01)', layout=layout1)
# Using plain pandas .plot() instead
df.plot(kind='line', figsize=(12, 5));
df['date'] = df.index.date
df.groupby(['date'])[['cnt']].sum().head()
| date       | cnt    |
|------------|--------|
| 2015-01-01 | 382014 |
| 2015-01-02 | 345296 |
| 2015-01-03 | 406769 |
| 2015-01-04 | 328848 |
| 2015-01-05 | 363454 |
df.groupby(['date'])[['cnt']].sum().iplot(layout=layout1)
df.groupby(['date'])[['cnt']].sum().plot(kind='line', figsize=(12, 5));
# Feature Engineering
df['weekday'] = df.index.weekday
df['hour'] = df.index.hour
df['weeknum'] = df.index.week  # deprecated in newer pandas; use df.index.isocalendar().week there
df['is_weekend'] = ((pd.DatetimeIndex(df.index).dayofweek) // 5 == 1).astype(int)
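The // 5 == 1 trick works because dayofweek runs from 0 (Monday) to 6 (Sunday), so only 5 and 6 integer-divide to 1. An equivalent, arguably more readable form:

# Saturday=5, Sunday=6
df['is_weekend'] = df.index.dayofweek.isin([5, 6]).astype(int)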
df.groupby('hour')[['cnt']].sum().head(5)
| hour | cnt    |
|------|--------|
| 0    | 469971 |
| 1    | 355145 |
| 2    | 268133 |
| 3    | 198524 |
| 4    | 143271 |
df.groupby('hour')[['cnt']].sum().iplot(layout=layout1)
df.groupby('hour')['cnt'].sum().plot(kind='line', style="-o", figsize=(15, 5));
# Heatmaps usually don't need to be interactive, so seaborn is recommended here
plt.figure(figsize=(12,8))
sns.heatmap(df.groupby(['hour', 'weekday'])['cnt'].mean().unstack(),
lw=.5, annot=True, cmap='GnBu', fmt='g', annot_kws={'size':10});
df.groupby(['hour', 'weekday'])[['cnt']].mean().head(10)
| hour | weekday | cnt      |
|------|---------|----------|
| 0    | 0       | 8552.75  |
| 0    | 1       | 6719.25  |
| 0    | 2       | 8787.00  |
| 0    | 3       | 14326.40 |
| 0    | 4       | 14054.20 |
| 0    | 5       | 25487.20 |
| 0    | 6       | 26099.00 |
| 1    | 0       | 5360.50  |
| 1    | 1       | 3707.00  |
| 1    | 2       | 5283.25  |
df.groupby(['hour', 'weekday'])['cnt'].mean().unstack().head(3)
| hour \ weekday | 0       | 1       | 2       | 3       | 4       | 5       | 6        |
|----------------|---------|---------|---------|---------|---------|---------|----------|
| 0              | 8552.75 | 6719.25 | 8787.00 | 14326.4 | 14054.2 | 25487.2 | 26099.00 |
| 1              | 5360.50 | 3707.00 | 5283.25 | 11440.4 | 8785.6  | 21118.6 | 22754.75 |
| 2              | 3675.25 | 2361.75 | 3337.75 | 8753.6  | 5856.8  | 16382.0 | 18918.00 |
average_df = df.groupby(['is_weekend', 'hour']).mean()['cnt'].\
unstack(level=0).rename(columns={0:"weekday", 1:"weekend"})
average_df.iplot(layout=layout1)
sns.lineplot(data=average_df);
df.groupby(['weekday','hour'])[['cnt']].sum().head()
| weekday | hour | cnt   |
|---------|------|-------|
| 0       | 0    | 34211 |
| 0       | 1    | 21442 |
| 0       | 2    | 14701 |
| 0       | 3    | 10920 |
| 0       | 4    | 10765 |
# Probably not the plot you wanted: the whole MultiIndex series is drawn as a single line
df.groupby(['weekday','hour'])[['cnt']].sum().plot();
df.groupby(['weekday','hour'])['cnt'].sum().unstack(level=0).head(5)
| hour \ weekday | 0     | 1     | 2     | 3     | 4     | 5      | 6      |
|----------------|-------|-------|-------|-------|-------|--------|--------|
| 0              | 34211 | 26877 | 35148 | 71632 | 70271 | 127436 | 104396 |
| 1              | 21442 | 14828 | 21133 | 57202 | 43928 | 105593 | 91019  |
| 2              | 14701 | 9447  | 13351 | 43768 | 29284 | 81910  | 75672  |
| 3              | 10920 | 6016  | 9228  | 35151 | 19743 | 60560  | 56906  |
| 4              | 10765 | 6535  | 9189  | 27224 | 16709 | 38652  | 34197  |
df.groupby(['weekday','hour'])['cnt'].sum().unstack(level=0).plot();
# Why is reset_index() needed? First, try without it:
df.groupby(['weekday', 'hour']).mean()['cnt'].unstack(level=0).\
melt(id_vars="hour", value_vars=[0,1,2,3,4,5,6], value_name='cnt').head(5)
|   | hour | weekday | cnt     |
|---|------|---------|---------|
| 0 | NaN  | 0       | 8552.75 |
| 1 | NaN  | 0       | 5360.50 |
| 2 | NaN  | 0       | 3675.25 |
| 3 | NaN  | 0       | 2730.00 |
| 4 | NaN  | 0       | 2691.25 |
# melt doesn't see the index, so reset_index() first turns hour into a real column
df.groupby(['weekday', 'hour']).mean()['cnt'].unstack(level=0).reset_index().\
melt(id_vars="hour", value_vars=[0,1,2,3,4,5,6], value_name='cnt').head(5)
|   | hour | weekday | cnt     |
|---|------|---------|---------|
| 0 | 0    | 0       | 8552.75 |
| 1 | 1    | 0       | 5360.50 |
| 2 | 2    | 0       | 3675.25 |
| 3 | 3    | 0       | 2730.00 |
| 4 | 4    | 0       | 2691.25 |
# factorplot (renamed catplot in newer seaborn) needs long-form input,
# unlike the wide df.groupby(['weekday','hour'])['cnt'].sum().unstack(level=0) frame used above
data = df.groupby(['weekday', 'hour']).mean()['cnt'].unstack(level=0).reset_index()
data = data.melt(id_vars="hour", value_vars=[0,1,2,3,4,5,6], value_name='cnt')
sns.factorplot(x="hour", y='cnt', hue="weekday", data=data, height=5, aspect=3);
plt.figure(figsize=(16, 6));
sns.boxplot(x='hour', y='cnt', data=df);
plt.title("Hourly Box Plot(2015-01 Data)");
def visualize_hourly_boxplot_by_weeknum(df, y, weeknum):
plt.figure(figsize=(16, 6));
sns.boxplot(x='hour', y=y, data=df[df['weeknum']==weeknum]);
plt.title(f"Hourly Box Plot(2015-{weeknum:02} Data)");
for week in range(1, 3):
visualize_hourly_boxplot_by_weeknum(df, 'cnt', week)
import ipywidgets as widgets
from ipywidgets import interact
# This setting allows multiple outputs from a single cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
def f(x):
return x
interact(f, x=10);
interact(f, x=widgets.IntSlider(min=-10, max=30, step=1, value=10));
interact(f, x=['apples','oranges']);
@interact(x=True, y=1.0)
def g(x, y):
return (x, y)
df.head()
| pickup_hour         | cnt   | date       | weekday | hour | weeknum | is_weekend |
|---------------------|-------|------------|---------|------|---------|------------|
| 2015-01-01 00:00:00 | 28312 | 2015-01-01 | 3       | 0    | 1       | 0          |
| 2015-01-01 01:00:00 | 31707 | 2015-01-01 | 3       | 1    | 1       | 0          |
| 2015-01-01 02:00:00 | 28068 | 2015-01-01 | 3       | 2    | 1       | 0          |
| 2015-01-01 03:00:00 | 24288 | 2015-01-01 | 3       | 3    | 1       | 0          |
| 2015-01-01 04:00:00 | 17081 | 2015-01-01 | 3       | 4    | 1       | 0          |
def visualize_by_date(df):
def view_images(date):
data = df.loc[df['date'] == date]['cnt']
ax = data.plot();
ax.set_title(f'date is {date}')
interact(view_images, date=list(np.sort(df['date'].unique())))
visualize_by_date(df)
!pip3 install pydeck
!jupyter nbextension install --sys-prefix --symlink --overwrite --py pydeck
!jupyter nbextension enable --sys-prefix --py pydeck
# Then restart the notebook and terminal
import pydeck as pdk
%%time
agg_query = """
WITH base_data AS
(
SELECT
nyc_taxi.*,
pickup.zip_code as pickup_zip_code,
pickup.internal_point_lat as pickup_zip_code_lat,
pickup.internal_point_lon as pickup_zip_code_lon,
dropoff.zip_code as dropoff_zip_code,
dropoff.internal_point_lat as dropoff_zip_code_lat,
dropoff.internal_point_lon as dropoff_zip_code_lon
FROM (
SELECT *
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`
WHERE
EXTRACT(MONTH from pickup_datetime) = 1
and pickup_latitude <= 90 and pickup_latitude >= -90
and dropoff_latitude <= 90 and dropoff_latitude >= -90
) AS nyc_taxi
JOIN (
SELECT zip_code, state_code, state_name, city, county, zip_code_geom, internal_point_lat, internal_point_lon
FROM `bigquery-public-data.geo_us_boundaries.zip_codes`
WHERE state_code='NY'
) AS pickup
ON ST_CONTAINS(pickup.zip_code_geom, st_geogpoint(pickup_longitude, pickup_latitude))
JOIN (
SELECT zip_code, state_code, state_name, city, county, zip_code_geom, internal_point_lat, internal_point_lon
FROM `bigquery-public-data.geo_us_boundaries.zip_codes`
WHERE state_code='NY'
) AS dropoff
ON ST_CONTAINS(dropoff.zip_code_geom, st_geogpoint(dropoff_longitude, dropoff_latitude))
)
SELECT
DATETIME_TRUNC(pickup_datetime, hour) AS pickup_hour,
pickup_zip_code,
pickup_zip_code_lat,
pickup_zip_code_lon,
dropoff_zip_code,
dropoff_zip_code_lat,
dropoff_zip_code_lon,
COUNT(*) AS cnt
FROM base_data
WHERE pickup_datetime <= '2015-01-03'
GROUP BY 1,2,3,4,5,6,7
HAVING cnt >= 20
"""
agg_df = pd.read_gbq(query=agg_query, dialect='standard', project_id=project_id)
CPU times: user 384 ms, sys: 20.6 ms, total: 404 ms Wall time: 1min 29s
# Check the prepared data
agg_df.tail(7)
|      | pickup_hour         | pickup_zip_code | pickup_zip_code_lat | pickup_zip_code_lon | dropoff_zip_code | dropoff_zip_code_lat | dropoff_zip_code_lon | cnt |
|------|---------------------|-----------------|---------------------|---------------------|------------------|----------------------|----------------------|-----|
| 8450 | 2015-01-01 17:00:00 | 11430           | 40.646809           | -73.786169          | 10036            | 40.759254            | -73.989827           | 24  |
| 8451 | 2015-01-01 12:00:00 | 11430           | 40.646809           | -73.786169          | 10036            | 40.759254            | -73.989827           | 20  |
| 8452 | 2015-01-01 20:00:00 | 11430           | 40.646809           | -73.786169          | 10016            | 40.745221            | -73.978294           | 20  |
| 8453 | 2015-01-02 16:00:00 | 11430           | 40.646809           | -73.786169          | 10036            | 40.759254            | -73.989827           | 27  |
| 8454 | 2015-01-02 21:00:00 | 11430           | 40.646809           | -73.786169          | 10036            | 40.759254            | -73.989827           | 22  |
| 8455 | 2015-01-02 18:00:00 | 11430           | 40.646809           | -73.786169          | 10016            | 40.745221            | -73.978294           | 22  |
| 8456 | 2015-01-01 18:00:00 | 11430           | 40.646809           | -73.786169          | 10036            | 40.759254            | -73.989827           | 22  |
import datetime
default_df = agg_df.loc[agg_df['pickup_hour'].dt.date == datetime.date(2015, 1, 1)]
# Pick a layer type
arc_layer = pdk.Layer(
'ArcLayer',
default_df,
get_source_position='[pickup_zip_code_lon, pickup_zip_code_lat]',
get_target_position='[dropoff_zip_code_lon, dropoff_zip_code_lat]',
get_source_color='[255, 255, 120]',
get_target_color='[255, 0, 0]',
width_units='meters',
get_width="cnt",
pickable=True,
auto_highlight=True,
)
# Define the ViewState: the viewpoint for the map (coordinates, zoom level, bearing, etc.)
nyc_center = [-73.9808, 40.7648]
view_state = pdk.ViewState(longitude=nyc_center[0], latitude=nyc_center[1], zoom=11)
# Create the Deck, wiring the Layer to the ViewState, and add a tooltip
r = pdk.Deck(layers=[arc_layer], initial_view_state=view_state,
tooltip={
'html': '<b>count:</b> {cnt}',
'style': {
'color': 'white'
}
}
)
# Render
r.show()
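If show() renders nothing (e.g., the nbextension isn't active, or you're running outside Jupyter), pydeck can also write the map to a standalone HTML file; the file name below is just an example:

# Fallback: self-contained HTML you can open in any browser
r.to_html('taxi_arcs.html')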
def visualize_demand(date=list(np.sort(agg_df['pickup_hour'].dt.date.unique())),
hour=widgets.IntSlider(min=0, max=23, step=1, value=0)):
filter_df = agg_df[(agg_df['pickup_hour'].dt.date == date) & (agg_df['pickup_hour'].dt.hour == hour)].to_dict(orient='records')
    # Swap in the filtered records as arc_layer's data
arc_layer.data = filter_df
    r.update()  # calling update() is the key step
r.show()
display(interact(visualize_demand))
<function __main__.visualize_demand(date=[datetime.date(2015, 1, 1), datetime.date(2015, 1, 2)], hour=IntSlider(value=0, description='hour', max=23))>
Please submit your Colab code (seriously).