Residential End Uses of Water 2016, Python Code

Removing Rows from .csv file based on text criteria

*Requires installation of Pandas Package

#Description: Filter out rows with text criteria in a csv file using Pandas in Python.
#This code will delete all rows with your given text criteria ('text criteria') in your selected column (MyColumn)
#Save a new csv file excluding deleted rows.
#Also keep in mind that Python is case sensitive for text criteria (and in general).
#Note: this code is for filtering by text criteria only. To filter by numerical value or range, see Pandas Online Documentation

# Filter out rows matching a text value in one column and save the rest.
import pandas as pd

# Define data and load your csv file:
data = pd.read_csv('full/directory/My_File.csv')

# Filter rows based on text criteria: keep only rows whose MyColumn value
# is NOT the unwanted text.
filtered_data = data[data.MyColumn != 'text criteria']

# Save new filtered csv file (this statement was missing from the original
# text; the surrounding comments describe it).
# Be sure to give your new file a different name to prevent overwriting
# your original data!
filtered_data.to_csv('full/directory/My_File_filtered.csv', index=False)

# Open your new filtered csv and see the unwanted rows have been deleted.

Change Timestamp Format to YYMMDDHH

*Requires Pandas and Datetime packages installed


import pandas as pd
# pd is a nickname for pandas, easier to type

from datetime import datetime
# datetime is a module that contains lots of functions, but we only
# need the class 'datetime' for this

# Load the data. The original text had curly quotes and the filename
# 'My_File-csv'; corrected to a plain ASCII '.csv' path.
df = pd.read_csv('My_File.csv')
# This stores your data in a dataframe called df.


def change_time_format(df):
    """Parse the StartTime column of *df* and add datetime-derived columns.

    StartTime strings must be in '%m/%d/%y %H:%M' format (24-hour clock).
    Adds columns: datetimes, daynumber (day-of-year string), year, hour,
    month, day, minute, second.  Mutates df in place; returns None.
    """
    times = df.StartTime.values

    # NOTE: times is a numpy array ('numpy.ndarray') containing strings
    # ('str'), which we need to parse into datetime objects.
    # Careful!  Need to use %H instead of %I because the data uses a
    # 24-hour clock, not a 12-hour clock.
    datetimes = [datetime.strptime(t, '%m/%d/%y %H:%M') for t in times]

    df['datetimes'] = datetimes
    df['daynumber'] = [datetime.strftime(d, '%j') for d in datetimes]
    df['year'] = [date.year for date in datetimes]
    df['hour'] = [date.hour for date in datetimes]
    df['month'] = [date.month for date in datetimes]
    # The 'date.day' expression was lost from the original text; restored.
    df['day'] = [date.day for date in datetimes]
    df['minute'] = [date.minute for date in datetimes]
    df['second'] = [date.second for date in datetimes]

    # The above will take a while to process on large dataframes.


Calculate Average Volume and Duration per House per Hour

*Requires Pandas, Numpy and TQDM packages installed


import pandas as pd
import numpy as np
from tqdm import tqdm

# Curly quotes in the original text replaced with plain ASCII quotes.
df = pd.read_csv('REU2016_data_indoor_usage.csv')

# Every distinct house id in the data, and the 24 hours to average over.
house_id_list = np.unique(df['House'])
hour_list = range(0, 24)


def get_avg(hour, house_id, data=None):
    """Return (mean Volume, mean Duration) per day for one house and hour.

    Averages are taken over only the days that have at least one event at
    that hour; an hour with no events at all returns (0, 0).

    data: optional DataFrame with House/hour/day/Volume/Duration columns;
          defaults to the module-level df loaded above.
    """
    src = df if data is None else data
    H = src[src['House'] == house_id]
    dayswithdata = np.unique(H[H['hour'] == hour].day)

    # The per-day condition '(H.day == day)' was lost from the original
    # text; restored so each sum covers a single day's events.
    Volumes = [sum(H[(H.hour == hour) & (H.day == day)].Volume)
               for day in dayswithdata]
    Durations = [sum(H[(H.hour == hour) & (H.day == day)].Duration)
                 for day in dayswithdata]

    mean_Volume = np.mean(Volumes)
    mean_Duration = np.mean(Durations)

    # np.mean([]) is nan; report 0 instead so downstream code sees numbers.
    if np.isnan(mean_Volume):
        mean_Volume = 0
    if np.isnan(mean_Duration):
        mean_Duration = 0

    return mean_Volume, mean_Duration


def make_dframe():
    """Build a dataframe of per-house, per-hour average Volume and Duration.

    Relies on the module-level df, house_id_list, hour_list and get_avg.
    Returns a DataFrame with columns Hour, House, Avg_Volume, Avg_Duration.
    """
    allVolumes = []
    allDurations = []
    allHours = []
    allHouse = []

    for house_id in tqdm(house_id_list):
        vol_dur = [get_avg(hour, house_id) for hour in hour_list]
        Volumes = [v for v, d in vol_dur]
        Durations = [d for v, d in vol_dur]
        Hours = hour_list
        House = [house_id] * len(hour_list)

        # Accumulate this house's rows.  These extend calls were lost from
        # the original text; without them the result would be empty.
        allVolumes.extend(Volumes)
        allDurations.extend(Durations)
        allHours.extend(Hours)
        allHouse.extend(House)

    newdf = pd.DataFrame({'Hour': allHours, 'House': allHouse,
                          'Avg_Volume': allVolumes,
                          'Avg_Duration': allDurations})
    return newdf


Calculating Average of Individual House Averages

*Requires installation of Pandas


#This code uses the Pandas Group By Function to take average data of a column based on a criteria

#For this code we found average Volume and Duration by Hour, where Volume, Duration and Hour were individual columns in the csv file


import pandas as pd

# Curly quotes in the original text replaced with plain ASCII quotes.
data = pd.read_csv('REU2016_volume_and_duration_by_hour.csv')

# Get Average Volume by hour over all houses:
data_avg_vol = data.groupby('Hour').Volume.agg(['count', 'min', 'max', 'mean'])

# Creates a new csv file with average volume over all houses by hour
# (the to_csv call was missing from the original text).
data_avg_vol.to_csv('REU2016_avg_volume_by_hour_stats.csv')

# Get Average Duration by hour over all houses:
data_avg_dur = data.groupby('Hour').Duration.agg(['count', 'min', 'max', 'mean'])

# Creates a new csv file with average duration over all houses by hour
data_avg_dur.to_csv('REU2016_avg_duration_by_hour_stats.csv')


Find Average Number of Events per Hour for All Houses (Pandas Method)

*Requires installation of Pandas

import pandas as pd

all_data = pd.read_csv('REU2016_data_indoor_usage.csv')

# Count events per hour.  NOTE(review): this statement was lost from the
# original text; a groupby size matches the comment below - confirm.
print(all_data.groupby('hour').size())

# This will print the total number of events by hour. In Excel, divide
# these numbers by the number of houses (756) times the number of days (14).

Find Average Number of Events per Hour for All Houses (Numpy Method)


Obtains the average number of events for a house and a given hour over the

observed time period.


import pandas as pd

from tqdm import tqdm

import numpy as np

# Load the per-event indoor usage data; get_avg_count below reads its
# 'House', 'hour' and 'day' columns.
df = pd.read_csv('REU2016_data_indoor_usage.csv')

def get_avg_count(hour, house_id, data=None):
    """Return the mean number of events at *hour* for *house_id* per day.

    Assume that there were observations for 14 days total, and that if
    there were no observations in a given day (ie 0 events) then this is
    not reported - so the divisor stays fixed at 14.

    data: optional DataFrame with House/hour/day columns; defaults to the
          module-level df loaded above.
    """
    src = df if data is None else data
    H = src[src['House'] == house_id]
    HH = H[H['hour'] == hour]
    # Days are calendar days 1..31; days absent from the data count 0.
    event_counts = [len(HH[HH['day'] == i]) for i in range(1, 32)]
    # (The original also computed the number of days with data, but it was
    # never used, so it has been removed.)
    total_observation_days = 14.0
    meancount = sum(event_counts) / total_observation_days
    return meancount

def make_dframe():
    """Build a dataframe of per-house, per-hour average event counts.

    Relies on the module-level df and get_avg_count.  Returns a DataFrame
    with columns Hour, House, Event_count.
    """
    house_id_list = np.unique(df['House'])
    hour_list = range(0, 24)
    allCounts = []
    allHouse = []
    allHours = []
    print('Making dataframe of event count data')
    for house_id in tqdm(house_id_list):
        Counts = [get_avg_count(hour, house_id) for hour in hour_list]
        House = [house_id] * len(hour_list)
        Hours = hour_list

        # Accumulate this house's rows.  These extend calls were lost from
        # the original text; without them the result would be empty.
        allCounts.extend(Counts)
        allHouse.extend(House)
        allHours.extend(Hours)

    newdf = pd.DataFrame({'Hour': allHours, 'House': allHouse,
                          'Event_count': allCounts})

    return newdf

data = pd.read_csv('Event_count_avg_per_house_by_hour.csv')

# Curly quotes in the original text replaced with plain ASCII quotes.
data_avg_event_count = data.groupby('Hour').Event_count.agg(['count', 'min', 'max', 'mean'])

# This will produce a new csv sheet with the hour, number of averages
# (756), min, max and mean of event counts for all homes. The mean
# should be the same result as the Pandas method.
# NOTE(review): the save call was missing from the original text - confirm
# the intended output filename.
data_avg_event_count.to_csv('Event_count_hourly_stats.csv')

Converting Microsoft Access file to CSV file in Mac OSX computer

#In the terminal:

brew install mdbtools

#This installs the package mdbtools

#to convert MS Access file to csv:

mdb-export database_name.accdb table_name > outputname.csv

Bokeh Code for Visualizing Residential End Uses of Water 2016 Data

# Standard library imports.
from os.path import join, dirname
import datetime
from collections import OrderedDict

# Third-party imports.
import numpy as np
import pandas as pd

# The original line read 'from import curdoc' (module name lost);
# curdoc lives in bokeh.io.
from bokeh.io import curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, DataRange1d, Select
from bokeh.models.widgets import CheckboxGroup, Button
from bokeh.palettes import Blues4
from bokeh.plotting import figure

# Map month names to the numeric month values stored in the data's
# 'month' column.  NOTE(review): the dictionary entries were lost from the
# original text; reconstructed in January-December calendar order.
months = OrderedDict([
    ('January', 1), ('February', 2), ('March', 3), ('April', 4),
    ('May', 5), ('June', 6), ('July', 7), ('August', 8),
    ('September', 9), ('October', 10), ('November', 11), ('December', 12),
])

# In Python 3, dict.keys() is a view object, so it must be converted to a
# list before it can be concatenated with another list.
month_list = ['All months'] + list(months.keys())

# Option lists for the selector widgets.
year_list = ['All years', '2012', '2013']

#day_list =  [str(i) for i in range(1,32) ]

weekday_list = ['Both', 'Weekday', 'Weekend']

# Initialize with these strings:

year = '2012'

month = 'March'

#day = '1'

weekday= 'Both'

# Load the pre-processed dataset the app visualizes.
df = pd.read_csv('REU16_data_for_bokeh.csv')

event_list = np.unique(df.Event.values)

event_list = [i for i in event_list if isinstance(i,str)] # Because there are some values of nan

# Default to the first two event types being selected.
events = event_list[0:2]

#house_list = list( np.unique(df.House.values) )

#house_list = ['All'] + house_list

region_list = list( np.unique(df.Region.values) )

region_list = ['All'] + region_list

region = region_list[0] # 'All' is the default.

# Drop-down selectors for each filter dimension.
year_select = Select(value=year, title='Year', options = year_list)

month_select = Select(value=month, title='Month', options = month_list)

#day_select = Select(value=day, title = 'Day', options = day_list)

weekday_select = Select(value=weekday, title = 'Weekday or weekend', options = weekday_list)

#house_select = Select(value=house, title = 'House', options = house_list)

region_select = Select(value=region, title = 'Region', options = region_list)

#event_select = Select(value=event, title = 'Event', options = event_list)

# Check-box group for event types (first two checked by default), plus
# buttons that select or clear every box at once.
event_group = CheckboxGroup(labels=event_list, active=[0, 1])

select_all = Button(label="Select all events")

clear_all = Button(label = "Clear all events")

def get_dataset(src, year=year, month=month, weekday=weekday, events=events, region=region):
    """Filter *src* by the current selections and return the plot data.

    Returns dict(x=hours, y=volumes) for the rows matching the selected
    year, month, weekday/weekend flag, event types and region.  'All
    years' / 'All months' / 'All' disable the corresponding filter.

    Raises ValueError if weekday is not one of the three known strings.
    """
    if weekday == 'Weekday':
        weekdayn = [1]
    elif weekday == 'Weekend':
        weekdayn = [0]
    elif weekday == 'Both':
        weekdayn = [0, 1]
    else:
        raise ValueError("weekday must be Weekday or Weekend")

    # The original enumerated every year/month/region combination with
    # nested if/else blocks (whose 'else:' lines were lost from the text).
    # Building one boolean mask incrementally selects the same rows.
    mask = src.Weekday.isin(weekdayn) & src.Event.isin(events)
    if month != 'All months':
        # Translate the month name to its numeric value in the data.
        mask &= (src.month == months[month])
    if year != 'All years':
        mask &= (src.year == int(year))
    if region != 'All':
        mask &= (src.Region == region)
    df = src[mask].copy()

    hours = df['hour'].values
    volume = df['Volume'].values
    return dict(x=hours, y=volume)

def make_plot(source, sourceavg, title):
    """Create the hourly-volume figure.

    Draws bars of the per-hour mean volume (sourceavg) plus a scatter of
    the individual event points (source).
    """
    plot = figure(plot_width=800, plot_height=500, x_range=(-0.5, 23.5),
                  tools="pan,wheel_zoom,box_zoom,save,reset,hover")
    plot.title.text = title

    plot.vbar(x='x', bottom=0, width=0.5, top='y', source=sourceavg,
              color="#CAB2D6")
    # NOTE(review): the original text fused two glyph calls on one line;
    # a circle scatter of the raw events is the most plausible reading of
    # the remaining "'x','y', source=source)" fragment - confirm.
    plot.circle('x', 'y', source=source)

    # fixed attributes
    plot.xaxis.axis_label = "Hour"
    plot.yaxis.axis_label = "Volume (in Gallons)"
    plot.axis.axis_label_text_font_style = "bold"
    plot.grid.grid_line_alpha = 0.3
    return plot

def update_plot(attrname, old, new):
    """Widget callback: re-filter the data and refresh both data sources."""
    year = year_select.value
    month = month_select.value
    #day = day_select.value
    weekday = weekday_select.value
    region = region_select.value

    # The original line was truncated to 'events ='; the checkbox group's
    # active indices are the only plausible source for the selection.
    events = event_group.active
    events = [event_list[i] for i in events]

    plot.title.text = "Average volume by hour, " + region + ", " + month + ' ' + year

    updated_data = get_dataset(df, year=year, month=month, weekday=weekday,
                               events=events, region=region)
    # Push the new selection into the scatter source, then recompute the
    # per-hour means for the bar source from it.
    source.data = updated_data
    sourceavg.data = get_avgdata(source)

def get_avgdata(source):
    """Return bar-plot data: mean of source's y values for each hour 0-23.

    source.data must hold parallel 'x' (hour) and 'y' (volume) sequences.
    Hours with no data get a mean of 0.
    """
    # The 'source.data' references were lost from the original text;
    # restored from the dict(x=..., y=...) layout produced by get_dataset.
    volume = np.asarray(source.data['y'])
    hours = np.asarray(source.data['x'])
    vv = [volume[np.where(hours == h)[0]] for h in range(24)]
    meanvolumes = [0] * 24
    for i in range(24):
        if len(vv[i]) == 0:
            meanvolumes[i] = 0
        else:
            meanvolumes[i] = np.mean(vv[i])
    return dict(x=range(24), y=meanvolumes)

def activate_all_events():
    """Button callback: check every event checkbox."""
    # The assignment target was lost from the original text; the checkbox
    # group's 'active' list of indices is what the clear callback resets.
    event_group.active = list(range(len(event_list)))

def clear_all_events():
    """Button callback: uncheck every event checkbox."""
    # The assignment target was lost from the original text; an empty
    # 'active' list means no checkbox is selected.
    event_group.active = []

# Initial data sources for the plot, built from the default selections.
datadict = get_dataset(df, year = year, month = month, weekday = weekday, events = events, region = region )

source = ColumnDataSource(datadict)

sourceavg = ColumnDataSource(get_avgdata(source))

plot = make_plot(source, sourceavg, "Average volume by hour, " + month + ' ' + year)

# Re-draw (via update_plot) whenever any selector value changes.
year_select.on_change('value', update_plot)

month_select.on_change('value', update_plot)

weekday_select.on_change('value', update_plot)

region_select.on_change('value', update_plot)

#event_select.on_change('value', update_plot)





        column(year_select, month_select, weekday_select,

               region_select, event_group, select_all, clear_all),

        plot ))

curdoc().title = "Residential End Uses of Water 2016 Data"