Residential End Uses of Water 2016, Python Code

Removing Rows from .csv file based on text criteria

*Requires installation of Pandas Package

#Description: Filter out rows with text criteria in a csv file using Pandas in Python.
#This code will delete all rows with your given text criteria ('text criteria') in your selected column (MyColumn)
#Save a new csv file excluding deleted rows.
#Also keep in mind that Python is case sensitive for text criteria (and in general).
#Note: this code is for filtering by text criteria only. To filter by numerical value or range, see Pandas Online Documentation

# Filter out rows matching a text value in one column and save the rest.
import pandas as pd

# Define data and load your csv file:
data = pd.read_csv('full/directory/My_File.csv')

# Filter rows based on text criteria: keep only rows whose MyColumn value
# is NOT the unwanted text.
filtered_data = data[data.MyColumn != 'text criteria']

# Save new filtered csv file (this statement was missing from the original
# text; the surrounding comments describe it).
# Be sure to give your new file a different name to prevent overwriting
# your original data!
filtered_data.to_csv('full/directory/My_File_filtered.csv', index=False)

# Open your new filtered csv and see the unwanted rows have been deleted.

Change Timestamp Format to YYMMDDHH

*Requires Pandas and Datetime packages installed


import pandas as pd
# pd is a nickname for pandas, easier to type

from datetime import datetime
# datetime is a module that contains lots of functions, but we only
# need the class 'datetime' for this

# Load the data. The original text had curly quotes and the filename
# 'My_File-csv'; corrected to a plain ASCII '.csv' path.
df = pd.read_csv('My_File.csv')
# This stores your data in a dataframe called df.


def change_time_format(df):
    """Parse the StartTime column of *df* and add datetime-derived columns.

    StartTime strings must be in '%m/%d/%y %H:%M' format (24-hour clock).
    Adds columns: datetimes, daynumber (day-of-year string), year, hour,
    month, day, minute, second.  Mutates df in place; returns None.
    """
    times = df.StartTime.values

    # NOTE: times is a numpy array ('numpy.ndarray') containing strings
    # ('str'), which we need to parse into datetime objects.
    # Careful!  Need to use %H instead of %I because the data uses a
    # 24-hour clock, not a 12-hour clock.
    datetimes = [datetime.strptime(t, '%m/%d/%y %H:%M') for t in times]

    df['datetimes'] = datetimes
    df['daynumber'] = [datetime.strftime(d, '%j') for d in datetimes]
    df['year'] = [date.year for date in datetimes]
    df['hour'] = [date.hour for date in datetimes]
    df['month'] = [date.month for date in datetimes]
    # The 'date.day' expression was lost from the original text; restored.
    df['day'] = [date.day for date in datetimes]
    df['minute'] = [date.minute for date in datetimes]
    df['second'] = [date.second for date in datetimes]

    # The above will take a while to process on large dataframes.


Calculate Average Volume and Duration per House per Hour

*Requires Pandas, Numpy and TQDM packages installed


import pandas as pd
import numpy as np
from tqdm import tqdm

# Curly quotes in the original text replaced with plain ASCII quotes.
df = pd.read_csv('REU2016_data_indoor_usage.csv')

# Every distinct house id in the data, and the 24 hours to average over.
house_id_list = np.unique(df['House'])
hour_list = range(0, 24)


def get_avg(hour, house_id, data=None):
    """Return (mean Volume, mean Duration) per day for one house and hour.

    Averages are taken over only the days that have at least one event at
    that hour; an hour with no events at all returns (0, 0).

    data: optional DataFrame with House/hour/day/Volume/Duration columns;
          defaults to the module-level df loaded above.
    """
    src = df if data is None else data
    H = src[src['House'] == house_id]
    dayswithdata = np.unique(H[H['hour'] == hour].day)

    # The per-day condition '(H.day == day)' was lost from the original
    # text; restored so each sum covers a single day's events.
    Volumes = [sum(H[(H.hour == hour) & (H.day == day)].Volume)
               for day in dayswithdata]
    Durations = [sum(H[(H.hour == hour) & (H.day == day)].Duration)
                 for day in dayswithdata]

    mean_Volume = np.mean(Volumes)
    mean_Duration = np.mean(Durations)

    # np.mean([]) is nan; report 0 instead so downstream code sees numbers.
    if np.isnan(mean_Volume):
        mean_Volume = 0
    if np.isnan(mean_Duration):
        mean_Duration = 0

    return mean_Volume, mean_Duration


def make_dframe():
    """Build a dataframe of per-house, per-hour average Volume and Duration.

    Relies on the module-level df, house_id_list, hour_list and get_avg.
    Returns a DataFrame with columns Hour, House, Avg_Volume, Avg_Duration.
    """
    allVolumes = []
    allDurations = []
    allHours = []
    allHouse = []

    for house_id in tqdm(house_id_list):
        vol_dur = [get_avg(hour, house_id) for hour in hour_list]
        Volumes = [v for v, d in vol_dur]
        Durations = [d for v, d in vol_dur]
        Hours = hour_list
        House = [house_id] * len(hour_list)

        # Accumulate this house's rows.  These extend calls were lost from
        # the original text; without them the result would be empty.
        allVolumes.extend(Volumes)
        allDurations.extend(Durations)
        allHours.extend(Hours)
        allHouse.extend(House)

    newdf = pd.DataFrame({'Hour': allHours, 'House': allHouse,
                          'Avg_Volume': allVolumes,
                          'Avg_Duration': allDurations})
    return newdf


Calculating Average of Individual House Averages

*Requires installation of Pandas


#This code uses the Pandas Group By Function to take average data of a column based on a criteria

#For this code we found average Volume and Duration by Hour, where Volume, Duration and Hour were individual columns in the csv file


import pandas as pd

# Curly quotes in the original text replaced with plain ASCII quotes.
data = pd.read_csv('REU2016_volume_and_duration_by_hour.csv')

# Get Average Volume by hour over all houses:
data_avg_vol = data.groupby('Hour').Volume.agg(['count', 'min', 'max', 'mean'])

# Creates a new csv file with average volume over all houses by hour
# (the to_csv call was missing from the original text).
data_avg_vol.to_csv('REU2016_avg_volume_by_hour_stats.csv')

# Get Average Duration by hour over all houses:
data_avg_dur = data.groupby('Hour').Duration.agg(['count', 'min', 'max', 'mean'])

# Creates a new csv file with average duration over all houses by hour
data_avg_dur.to_csv('REU2016_avg_duration_by_hour_stats.csv')


Find Average Number of Events per Hour for All Houses (Pandas Method)

*Requires installation of Pandas

import pandas as pd

all_data = pd.read_csv('REU2016_data_indoor_usage.csv')

# Count events per hour.  NOTE(review): this statement was lost from the
# original text; a groupby size matches the comment below - confirm.
print(all_data.groupby('hour').size())

# This will print the total number of events by hour. In Excel, divide
# these numbers by the number of houses (756) times the number of days (14).

Find Average Number of Events per Hour for All Houses (Numpy Method)


Obtains the average number of events for a house and a given hour over the

observed time period.


import pandas as pd

from tqdm import tqdm

import numpy as np

# Load the per-event indoor usage data; get_avg_count below reads its
# 'House', 'hour' and 'day' columns.
df = pd.read_csv('REU2016_data_indoor_usage.csv')

def get_avg_count(hour, house_id, data=None):
    """Return the mean number of events at *hour* for *house_id* per day.

    Assume that there were observations for 14 days total, and that if
    there were no observations in a given day (ie 0 events) then this is
    not reported - so the divisor stays fixed at 14.

    data: optional DataFrame with House/hour/day columns; defaults to the
          module-level df loaded above.
    """
    src = df if data is None else data
    H = src[src['House'] == house_id]
    HH = H[H['hour'] == hour]
    # Days are calendar days 1..31; days absent from the data count 0.
    event_counts = [len(HH[HH['day'] == i]) for i in range(1, 32)]
    # (The original also computed the number of days with data, but it was
    # never used, so it has been removed.)
    total_observation_days = 14.0
    meancount = sum(event_counts) / total_observation_days
    return meancount

def make_dframe():
    """Build a dataframe of per-house, per-hour average event counts.

    Relies on the module-level df and get_avg_count.  Returns a DataFrame
    with columns Hour, House, Event_count.
    """
    house_id_list = np.unique(df['House'])
    hour_list = range(0, 24)
    allCounts = []
    allHouse = []
    allHours = []
    print('Making dataframe of event count data')
    for house_id in tqdm(house_id_list):
        Counts = [get_avg_count(hour, house_id) for hour in hour_list]
        House = [house_id] * len(hour_list)
        Hours = hour_list

        # Accumulate this house's rows.  These extend calls were lost from
        # the original text; without them the result would be empty.
        allCounts.extend(Counts)
        allHouse.extend(House)
        allHours.extend(Hours)

    newdf = pd.DataFrame({'Hour': allHours, 'House': allHouse,
                          'Event_count': allCounts})

    return newdf

data = pd.read_csv('Event_count_avg_per_house_by_hour.csv')

# Curly quotes in the original text replaced with plain ASCII quotes.
data_avg_event_count = data.groupby('Hour').Event_count.agg(['count', 'min', 'max', 'mean'])

# This will produce a new csv sheet with the hour, number of averages
# (756), min, max and mean of event counts for all homes. The mean
# should be the same result as the Pandas method.
# NOTE(review): the save call was missing from the original text - confirm
# the intended output filename.
data_avg_event_count.to_csv('Event_count_hourly_stats.csv')

Converting Microsoft Access file to CSV file in Mac OSX computer

#In the terminal:

brew install mdbtools

#This installs the package mdbtools

#to convert MS Access file to csv:

mdb-export database_name.accdb table_name > outputname.csv

Bokeh Code for Visualizing Residential End Uses of Water 2016 Data

# Standard library imports.
from os.path import join, dirname
import datetime
from collections import OrderedDict

# Third-party imports.
import numpy as np
import pandas as pd

# The original line read 'from import curdoc' (module name lost);
# curdoc lives in bokeh.io.
from bokeh.io import curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, DataRange1d, Select
from bokeh.models.widgets import CheckboxGroup, Button
from bokeh.palettes import Blues4
from bokeh.plotting import figure

# Map month names to the numeric month values stored in the data's
# 'month' column.  NOTE(review): the dictionary entries were lost from the
# original text; reconstructed in January-December calendar order.
months = OrderedDict([
    ('January', 1), ('February', 2), ('March', 3), ('April', 4),
    ('May', 5), ('June', 6), ('July', 7), ('August', 8),
    ('September', 9), ('October', 10), ('November', 11), ('December', 12),
])

# In Python 3, dict.keys() is a view object, so it must be converted to a
# list before it can be concatenated with another list.
month_list = ['All months'] + list(months.keys())

# Option lists for the selector widgets.
year_list = ['All years', '2012', '2013']

#day_list =  [str(i) for i in range(1,32) ]

weekday_list = ['Both', 'Weekday', 'Weekend']

# Initialize with these strings:

year = '2012'

month = 'March'

#day = '1'

weekday= 'Both'

# Load the pre-processed dataset the app visualizes.
df = pd.read_csv('REU16_data_for_bokeh.csv')

event_list = np.unique(df.Event.values)

event_list = [i for i in event_list if isinstance(i,str)] # Because there are some values of nan

# Default to the first two event types being selected.
events = event_list[0:2]

#house_list = list( np.unique(df.House.values) )

#house_list = ['All'] + house_list

region_list = list( np.unique(df.Region.values) )

region_list = ['All'] + region_list

region = region_list[0] # 'All' is the default.

# Drop-down selectors for each filter dimension.
year_select = Select(value=year, title='Year', options = year_list)

month_select = Select(value=month, title='Month', options = month_list)

#day_select = Select(value=day, title = 'Day', options = day_list)

weekday_select = Select(value=weekday, title = 'Weekday or weekend', options = weekday_list)

#house_select = Select(value=house, title = 'House', options = house_list)

region_select = Select(value=region, title = 'Region', options = region_list)

#event_select = Select(value=event, title = 'Event', options = event_list)

# Check-box group for event types (first two checked by default), plus
# buttons that select or clear every box at once.
event_group = CheckboxGroup(labels=event_list, active=[0, 1])

select_all = Button(label="Select all events")

clear_all = Button(label = "Clear all events")

def get_dataset(src, year=year, month=month, weekday=weekday, events=events, region=region):
    """Filter *src* by the current selections and return the plot data.

    Returns dict(x=hours, y=volumes) for the rows matching the selected
    year, month, weekday/weekend flag, event types and region.  'All
    years' / 'All months' / 'All' disable the corresponding filter.

    Raises ValueError if weekday is not one of the three known strings.
    """
    if weekday == 'Weekday':
        weekdayn = [1]
    elif weekday == 'Weekend':
        weekdayn = [0]
    elif weekday == 'Both':
        weekdayn = [0, 1]
    else:
        raise ValueError("weekday must be Weekday or Weekend")

    # The original enumerated every year/month/region combination with
    # nested if/else blocks (whose 'else:' lines were lost from the text).
    # Building one boolean mask incrementally selects the same rows.
    mask = src.Weekday.isin(weekdayn) & src.Event.isin(events)
    if month != 'All months':
        # Translate the month name to its numeric value in the data.
        mask &= (src.month == months[month])
    if year != 'All years':
        mask &= (src.year == int(year))
    if region != 'All':
        mask &= (src.Region == region)
    df = src[mask].copy()

    hours = df['hour'].values
    volume = df['Volume'].values
    return dict(x=hours, y=volume)

def make_plot(source, sourceavg, title):
    """Create the hourly-volume figure.

    Draws bars of the per-hour mean volume (sourceavg) plus a scatter of
    the individual event points (source).
    """
    plot = figure(plot_width=800, plot_height=500, x_range=(-0.5, 23.5),
                  tools="pan,wheel_zoom,box_zoom,save,reset,hover")
    plot.title.text = title

    plot.vbar(x='x', bottom=0, width=0.5, top='y', source=sourceavg,
              color="#CAB2D6")
    # NOTE(review): the original text fused two glyph calls on one line;
    # a circle scatter of the raw events is the most plausible reading of
    # the remaining "'x','y', source=source)" fragment - confirm.
    plot.circle('x', 'y', source=source)

    # fixed attributes
    plot.xaxis.axis_label = "Hour"
    plot.yaxis.axis_label = "Volume (in Gallons)"
    plot.axis.axis_label_text_font_style = "bold"
    plot.grid.grid_line_alpha = 0.3
    return plot

def update_plot(attrname, old, new):
    """Widget callback: re-filter the data and refresh both data sources."""
    year = year_select.value
    month = month_select.value
    #day = day_select.value
    weekday = weekday_select.value
    region = region_select.value

    # The original line was truncated to 'events ='; the checkbox group's
    # active indices are the only plausible source for the selection.
    events = event_group.active
    events = [event_list[i] for i in events]

    plot.title.text = "Average volume by hour, " + region + ", " + month + ' ' + year

    updated_data = get_dataset(df, year=year, month=month, weekday=weekday,
                               events=events, region=region)
    # Push the new selection into the scatter source, then recompute the
    # per-hour means for the bar source from it.
    source.data = updated_data
    sourceavg.data = get_avgdata(source)

def get_avgdata(source):
    """Return bar-plot data: mean of source's y values for each hour 0-23.

    source.data must hold parallel 'x' (hour) and 'y' (volume) sequences.
    Hours with no data get a mean of 0.
    """
    # The 'source.data' references were lost from the original text;
    # restored from the dict(x=..., y=...) layout produced by get_dataset.
    volume = np.asarray(source.data['y'])
    hours = np.asarray(source.data['x'])
    vv = [volume[np.where(hours == h)[0]] for h in range(24)]
    meanvolumes = [0] * 24
    for i in range(24):
        if len(vv[i]) == 0:
            meanvolumes[i] = 0
        else:
            meanvolumes[i] = np.mean(vv[i])
    return dict(x=range(24), y=meanvolumes)

def activate_all_events():
    """Button callback: check every event checkbox."""
    # The assignment target was lost from the original text; the checkbox
    # group's 'active' list of indices is what the clear callback resets.
    event_group.active = list(range(len(event_list)))

def clear_all_events():
    """Button callback: uncheck every event checkbox."""
    # The assignment target was lost from the original text; an empty
    # 'active' list means no checkbox is selected.
    event_group.active = []

# Initial data sources for the plot, built from the default selections.
datadict = get_dataset(df, year = year, month = month, weekday = weekday, events = events, region = region )

source = ColumnDataSource(datadict)

sourceavg = ColumnDataSource(get_avgdata(source))

plot = make_plot(source, sourceavg, "Average volume by hour, " + month + ' ' + year)

# Re-draw (via update_plot) whenever any selector value changes.
year_select.on_change('value', update_plot)

month_select.on_change('value', update_plot)

weekday_select.on_change('value', update_plot)

region_select.on_change('value', update_plot)

#event_select.on_change('value', update_plot)





        column(year_select, month_select, weekday_select,

               region_select, event_group, select_all, clear_all),

        plot ))

curdoc().title = "Residential End Uses of Water 2016 Data"