Staley Creek Temperature Analysis

This is a Jupyter Notebook documenting my data analysis workflow in python and thought process when describing temperature data pre and post Sage 0 restoration in lower Staley Creek.

In [1]:
##Pandas handles all the data transformations and table structuring
import pandas as pd

##Numpy handles all the math
import numpy as np 

##matplotlid handles all the graphing
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

#sklean handels the stats
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

##seaborn handels some additional plotting and stats
import seaborn as sns

##This line will make all the plots immediately draw within the notebook.
%matplotlib inline

##Check for Python version >3.0
import sys
print(sys.version)

##Available memory
import psutil
psutil.virtual_memory()
3.7.3 (default, Apr 24 2019, 13:20:13) [MSC v.1915 32 bit (Intel)]
Out[1]:
svmem(total=34270703616, available=27421204480, percent=20.0, used=6849499136, free=27421204480)
In [2]:
df = pd.read_csv("temp/AllStaleyTempData_2016-2018.csv")
In [3]:
df['Date'] = pd.to_datetime(df['Date'])

df['Time'] = pd.to_datetime(df['Time'], format='%H:%M')
df['Time'] = df['Time'].dt.round("30min")
df['Time'] = df['Time'].dt.time

df['Distance'] = df['Distance'].astype(int)

Time Series

In [4]:
df_16 = df[df.Year == 2016]
for location, group in df_16.groupby('Location'):
    group.plot(y='Temp', use_index= False, title='2016' + ' ' + location, figsize = (12,4))
In [5]:
df_17 = df[df.Year == 2017]
for location, group in df_17.groupby('Location'):
    group.plot(y='Temp', use_index= False, title= '2017' + ' ' + location, figsize = (12,4))
In [6]:
df_18 = df[df.Year == 2018]
for location, group in df_18.groupby('Location'):
    group.plot(y='Temp', use_index= False, title='2018' + ' ' + location, figsize = (12,4))

2018 Hottest Days Analysis (July- September)

In [7]:
df_2018_hot = (df['Date'] >= '07//2018') & (df['Date'] <= '09//2018')
df_2018_hot = df.loc[df_2018_hot]
df_2018_hot.head()
Out[7]:
Date Time Year Patch Temp Distance Location
12384 2018-07-01 00:00:00 2018 1 13.0 632 Restoration
12385 2018-07-01 00:30:00 2018 1 13.0 632 Restoration
12386 2018-07-01 01:00:00 2018 1 13.0 632 Restoration
12387 2018-07-01 01:30:00 2018 1 12.5 632 Restoration
12388 2018-07-01 02:00:00 2018 1 12.5 632 Restoration
In [8]:
for location, group in df_2018_hot.groupby('Location'):
    y = group['Temp']
    x = group.index.get_level_values(0)
    fig, ax = plt.subplots(figsize=(12,4), dpi=80)
    ax.plot(x, y)
    ax.set_xticks(())
    #ax.set_xticklabels(rotation = 45, ha="center")
    fig.autofmt_xdate()
    ax.set_title( '2018' + ' ' + location)
In [9]:
gp_patch_hot = pd.DataFrame()

for patch, group in df_2018_hot.groupby("Patch"):
    if gp_patch_hot.empty:
        gp_patch_hot = group.set_index(['Date', 'Time'])[["Temp"]].rename(columns={"Temp" :patch})
    else:
        gp_patch_hot = gp_patch_hot.join(group.set_index(['Date', 'Time'])[["Temp"]].rename(columns={"Temp" :patch}))
        
gp_patch_hot.head(4)
Out[9]:
0 1 7 9 10 13 14 18 20 31 ... 77 79 82 84 93 94 95 96 97 101
Date Time
2018-07-01 00:00:00 13.594 13.0 NaN 13.0 13.0 13.0 13.0 13.0 13.0 12.5 ... 13.0 13.0 13.0 13.0 13.0 13.5 13.0 13.0 13.0 12.364
01:00:00 13.305 13.0 NaN 13.0 12.5 12.5 12.5 13.0 12.5 12.5 ... 13.0 12.5 13.0 12.5 13.0 13.5 12.5 12.5 13.0 12.050
02:00:00 13.016 12.5 NaN 12.5 12.5 12.5 12.5 12.5 12.0 12.0 ... 12.5 12.0 12.5 12.0 12.5 13.0 12.0 12.0 12.5 11.734
03:00:00 12.751 12.0 NaN 12.0 12.0 12.0 12.0 12.5 12.0 11.5 ... 12.0 12.0 12.5 12.0 12.0 13.0 12.0 12.0 12.5 11.467

4 rows × 35 columns

In [10]:
p0_median = gp_patch_hot[0].median()
p101_median = gp_patch_hot[101].median()

gp_patch_hot.boxplot(figsize=(18,6))
plt.axhline(p0_median, color='r')
plt.axhline(p101_median, color='b')
Out[10]:
<matplotlib.lines.Line2D at 0x15ecc7b0>
In [11]:
gp_patch_hot_day = gp_patch_hot.groupby(level = 0).describe()
gp_patch_hot_day.head()
Out[11]:
0 1 ... 97 101
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
Date
2018-07-01 24.0 14.142083 1.771437 11.783 12.68450 13.8340 15.77800 16.963 24.0 13.770833 ... 16.000 18.0 24.0 12.448583 1.580387 10.394 10.98000 12.2070 13.93600 14.984
2018-07-02 24.0 13.656875 1.549930 11.637 12.32800 13.3290 15.01375 16.225 24.0 13.312500 ... 15.000 17.0 24.0 12.005417 1.344546 10.198 10.82750 11.8315 13.03425 14.314
2018-07-03 24.0 13.036417 1.815609 10.467 11.42475 12.9915 14.46875 15.819 24.0 12.500000 ... 14.625 17.0 24.0 11.141250 1.652763 8.841 9.58925 11.1385 12.61825 13.546
2018-07-04 24.0 13.752500 1.627282 11.686 12.27950 13.6055 15.15650 16.320 24.0 13.312500 ... 15.500 17.0 24.0 11.970417 1.410549 10.247 10.66300 11.5400 13.29300 14.026
2018-07-05 24.0 14.320458 1.876224 11.856 12.71475 14.1575 15.96250 17.201 24.0 13.979167 ... 16.125 18.5 24.0 12.613792 1.701629 10.418 11.00500 12.1705 14.24775 15.127

5 rows × 280 columns

7 Day Rolling Max

In [12]:
gp_patch_hot_7_max = gp_patch_hot_day.xs('max', axis = 1, level = 1, drop_level = False).rolling(7).max().plot(legend = False)

7 Day Rolling Mean

In [13]:
gp_patch_hot_day.xs('mean', axis = 1, level = 1, drop_level = False).rolling(7).mean().plot(legend = False)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1766ebd0>