Skip to content
Snippets Groups Projects
Commit 980e2ddf authored by Ben Anderson's avatar Ben Anderson
Browse files

moved code tree

parent 8a1fee1b
No related branches found
No related tags found
No related merge requests found
.Rproj.user
.Rhistory
.RData
...@@ -55,9 +55,9 @@ set more off ...@@ -55,9 +55,9 @@ set more off
clear all clear all
capture log close capture log close _all
log using "$logpath/CER-data-processing-samples-$version.smcl", replace log using "$logpath/CER-data-processing-samples-$version.smcl", replace name(main)
timer clear timer clear
...@@ -72,11 +72,43 @@ use "$dpath/processed/SME and Residential allocations.dta" ...@@ -72,11 +72,43 @@ use "$dpath/processed/SME and Residential allocations.dta"
* merge pre-trial residential survey data * merge pre-trial residential survey data
merge 1:1 ID using "$dpath/processed/Smart meters Residential pre-trial survey data-$version.dta" merge 1:1 ID using "$dpath/processed/Smart meters Residential pre-trial survey data-$version.dta"
drop _merge
* keep the residential IDs only * keep the residential IDs only
keep if sample == 1 keep if sample == 1
* keep basic info * set labels properly
lab var ba_npeople "Number of residents"
lab var ba_nchildren "Number of children"
lab var ba_nadults "Number of adults"
lab var ba_floorarea "Self-reported floor area in m2"
lab var ba_empl "HRP employment status"
log off main
log using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version-codebook.smcl", ///
replace name(cb)
desc
codebook
log off cb
log on main
* create a detailed version of the survey
outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-$version.csv", comma replace
* keep basic info to form a simplified version of the survey
keep ID sample res_stim res_tariff ba_* keep ID sample res_stim res_tariff ba_*
log off main
log using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version-codebook.smcl", ///
replace name(cb_wf)
desc
codebook
log off cb_wf
log on main
save "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.dta", replace save "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.dta", replace
outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.csv", comma replace outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$version.csv", comma replace
...@@ -84,7 +116,7 @@ outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$ver ...@@ -84,7 +116,7 @@ outsheet using "$dpath/processed/CER-pre-trial-survey-residential-sample-wf-$ver
************************************ ************************************
************************************ ************************************
* Create 4 samples - Oct/Dec 2009 and Oct/Dec 2010 * Create 2 samples - Oct 2009-2010 and Dec 2009-2010
* Make sure collect all half hours * Make sure collect all half hours
local Oct2009 "tin(01oct2009 00:00, 31oct2009 23:45)" local Oct2009 "tin(01oct2009 00:00, 31oct2009 23:45)"
local Dec2009 "tin(01dec2009 00:00, 31dec2009 23:45)" local Dec2009 "tin(01dec2009 00:00, 31dec2009 23:45)"
...@@ -110,6 +142,8 @@ foreach s of local samples { ...@@ -110,6 +142,8 @@ foreach s of local samples {
* restrict to the matched households * restrict to the matched households
keep if _merge == 3 keep if _merge == 3
drop _merge sample // reduce size of files
xtset ID s_datetime, delta(30 mins) xtset ID s_datetime, delta(30 mins)
* keep the periods we care about * keep the periods we care about
keep if `Oct2009' || `Oct2010' || `Dec2009' || `Dec2010' keep if `Oct2009' || `Oct2010' || `Dec2009' || `Dec2010'
......
########
# Data preparation of the Commission for Energy Regulation (CER)'s Irish Smart Meter Trial data
# - http://www.ucd.ie/issda/data/commissionforenergyregulationcer/
# - original electricity smart meter data = 6 File<n>.txt files (space delimited)
# Easiest way to merge them: cat *.txt > catFiles.txt on a (fast) unix box
# processes the original data for further use:
# - create subsamples for residential data for Oct 2009 - Oct 2010 & Dec 2009 - Dec 2010
# This work was funded by RCUK through the ESRC's Transformative Social Science Programme via the
# "Census 2022: Transforming Small Area Socio-Economic Indicators through 'Big Data'" Project
# - http://gtr.rcuk.ac.uk/project/2D2CD798-4F04-4399-B1AF-D810A233DD21
# - http://www.energy.soton.ac.uk/tag/census2022/
# Copyright (C) 2014 University of Southampton
# Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut,
# https://github.com/dataknut)
# [Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License
# (http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#YMMV - http://en.wiktionary.org/wiki/YMMV
# Housekeeping ----
# Clear the workspace so the script always starts from a clean state.
# NOTE(review): rm(list = ls()) does not unload packages or reset options();
# running in a fresh R session is the safer way to get a clean start.
rm(list=ls())
# data.table provides the keyed as.data.table()/merge()/subset operations used below
library(data.table)
# Root of the local copy of the CER data.
# NOTE(review): machine-specific path - confirm before running on another machine.
setwd("~/Documents/Work/Data/CER Smart Metering Project/data/")
# need to unzip and concatenate the data first - it comes as 6 zipped text files
# do this using "cat File1.txt File2.txt File3.txt File4.txt File5.txt File6.txt > catFiles.txt"
# Load data and fix variables ----
inpath <- "original/CER_both/CER Electricity Revised March 2012/"

# Sample allocation data: one row per meter ID with its trial allocation code
infile <- "SME and Residential allocations.csv"
CER_HH_sample_DT <- as.data.table(read.csv(paste0(inpath,infile)), key = "ID")

# Allocation codes in the source file:
#   1 = residential, 2 = SME, 3 = other
# Convert the numeric code to a labelled factor, in place
CER_HH_sample_DT[, Code := factor(Code,
                                  levels = c("1", "2", "3"),
                                  labels = c("Residential", "SME", "Other"))]
# Quick sanity check: counts per allocation group
table(CER_HH_sample_DT$Code)
# Merge in kWh data ----
# Test with File1.txt until we know it works
infile <- "File1"
# then switch to the full data file when we know it works!
#infile <- "catFiles"
# Construct the data file name - it is a .txt file
dataf <- paste0(inpath,infile,".txt")
print(paste0("Loading: ", dataf))
CER_HH_kWh_DT <- as.data.table(
  read.table(dataf, col.names = c("ID","timestamp", "kWh")),
  key = "ID"
)
# Sort by household (ID) and then by timestamp within household
setorder(CER_HH_kWh_DT, ID, timestamp)
# Attach each household's allocation code
CER_HH_kWh_DT <- merge(CER_HH_kWh_DT, CER_HH_sample_DT, by = "ID")
# Keep only the columns needed downstream
CER_HH_kWh_DT <- CER_HH_kWh_DT[, .(ID, Code, timestamp, kWh)]
# Create real date/time variables ----
# The supplied timestamp encodes time as:
#   digits 1-3: day code (day 1 = 1 January 2009)
#   digits 4-5: half hour 1-48 (1 = 00:00:00 - 00:29:59)
# Fixed: read.table() loads the timestamp as an integer, so leading zeros are
# lost and substr() mis-splits any day code below 100. Zero-pad back to the
# documented 5-character form before splitting.
ts_chr <- sprintf("%05d", CER_HH_kWh_DT$timestamp)
CER_HH_kWh_DT$day <- as.numeric(substr(ts_chr, 1, 3))
CER_HH_kWh_DT$halfhour <- as.numeric(substr(ts_chr, 4, 5))
# Fixed: the original call (`tz = , "", "%d/%m/%Y %H:%M:%S"`) left `tz` as an
# empty named argument, so the format string was never bound to `format`.
CER_HH_kWh_DT$datetime_z <- as.POSIXct("01/01/2009 00:00:00",
                                       tz = "",
                                       format = "%d/%m/%Y %H:%M:%S")
# Fixed off-by-one: day 1 must map onto 1 Jan 2009 itself, so add (day - 1)
# days - not `day` days - to the day-one datetime; likewise halfhour 1 adds 0.
# NOTE(review): adding fixed seconds ignores local clock changes (cf. the
# Stata code's halfhour 49/50 DST note) - confirm this is acceptable.
CER_HH_kWh_DT$datetime_start <- CER_HH_kWh_DT$datetime_z +
  ((CER_HH_kWh_DT$day - 1) * 24 * 60 * 60) +
  ((CER_HH_kWh_DT$halfhour - 1) * 30 * 60)
# Remove working variables to save memory
rm(ts_chr)
CER_HH_kWh_DT$timestamp <- NULL
CER_HH_kWh_DT$day <- NULL
CER_HH_kWh_DT$halfhour <- NULL
CER_HH_kWh_DT$datetime_z <- NULL
# Save outputs ----
outpath <- "processed"
# Save the month x sector subsamples only - the whole file would be very large.

# Write one month's half-hourly readings for one sector to CSV.
#   dt         : the merged half-hourly data.table (needs datetime_start, Code)
#   y          : year as a string, e.g. "2009"
#   month_num  : calendar month number (10 or 12 here)
#   month_name : month name used in the output file name
#   s          : sector label matching dt$Code ("Residential" or "SME")
# Fixed: the original filter `datetime_start %in% date_start:date_end`
# (a) materialised a one-second-resolution sequence of ~2.6m values per
# iteration, and (b) ended at 00:00 on the last day of the month, silently
# dropping almost all of that day's readings. Use half-open range
# comparisons with an exclusive bound at the start of the following month.
save_month_sample <- function(dt, y, month_num, month_name, s) {
  window_start <- as.POSIXct(sprintf("%s-%02d-01", y, month_num), tz = "")
  # exclusive upper bound: first instant of the following month
  window_end <- seq(window_start, by = "1 month", length.out = 2)[2]
  outfile <- paste0(outpath, "/", "CER_", month_name, "_", y, "_", s, ".csv")
  print(paste0("Saving: ", outfile))
  write.csv(
    dt[dt$datetime_start >= window_start &
         dt$datetime_start < window_end &
         dt$Code == s],
    row.names = FALSE,
    file = outfile)
}

# Use a pair of loops to prevent typing errors!
years <- c("2009", "2010")
samples <- c("Residential", "SME")
for (y in years) {
  for (s in samples) {
    print(paste0("Saving ", s, " in ", y))
    # October samples
    save_month_sample(CER_HH_kWh_DT, y, 10L, "October", s)
    # December samples
    save_month_sample(CER_HH_kWh_DT, y, 12L, "December", s)
  }
}
...@@ -37,7 +37,7 @@ GNU General Public License for more details. ...@@ -37,7 +37,7 @@ GNU General Public License for more details.
global where "~/Documents/Work" global where "~/Documents/Work"
* project * project
global proot "$where/Data/Social Science Datasets/CER Smart Metering Project" global proot "$where/Data/CER Smart Metering Project"
* data * data
global dpath "$proot/data" global dpath "$proot/data"
...@@ -68,8 +68,8 @@ local fill_option "full" ...@@ -68,8 +68,8 @@ local fill_option "full"
* only set this to 1 if you want to refresh ALL the input files! * only set this to 1 if you want to refresh ALL the input files!
local do_raw 0 // process big half hour data file - need to unzip the txt files, takes ages to run local do_raw 0 // process big half hour data file - need to unzip the txt files, takes ages to run
local do_summaries 0 // create summary files at ID level local do_summaries 1 // create summary files at ID level - takes even longer
local do_census2022 1 // create Census2022 Oct 2009 dataset local do_census2022 0 // create Census2022 Oct 2009 dataset
************************************ ************************************
************************************ ************************************
...@@ -113,30 +113,41 @@ if `do_raw' { ...@@ -113,30 +113,41 @@ if `do_raw' {
gen mins = 0 gen mins = 0
replace mins = 30 if mod(halfhour,2) == 1 replace mins = 30 if mod(halfhour,2) == 1
gen sec = 0 gen secs = 0
di "* create stata datetime - this breaks for those halfhours which are within clock changes - i.e. halfhour == 49 or 50" di "* create stata datetime - this breaks for those halfhours which are within clock changes - i.e. halfhour == 49 or 50"
gen double s_datetime = dhms(s_date, hour, mins, sec) gen double stata_datetime = dhms(s_date, hour, mins, secs)
format s_datetime %tc format stata_datetime %tc
di "* drop those where s_datetime = missing - XX this will mean dropping the 49 & 50 halfhours when the clocks changed" di "* drop those where s_datetime = missing - XX this will mean dropping the 49 & 50 halfhours when the clocks changed"
drop if s_datetime == . drop if stata_datetime == .
di "* check first 6 hours" di "* check first 6 hours"
li ID date halfhour s_* in 1/12, sep(2) li ID date halfhour s_* in 1/12, sep(2)
* which day does the data start/end? * which day does the data start/end?
qui: su s_datetime qui: su stata_datetime
di "Data start: " %tc `r(min)' di "Data start: " %tc `r(min)'
di "Data end: " %tc `r(max)' di "Data end: " %tc `r(max)'
* create an R-friendly date time
gen double t_date = dofc(stata_datetime)
gen double t_day = day(t_date)
gen double t_month = month(t_date)
gen double t_year = year(t_date)
egen r_date = concat(t_year t_month t_day), punct("/")
egen r_time = concat(hour mins secs), punct(":")
egen r_datetime = concat(r_date r_time), punct(" ")
di "* Drop all variables we don't need" di "* Drop all variables we don't need"
drop date s_date halfhour hour mins sec drop date s_date halfhour hour mins sec date t_* r_date r_time
di "* test for duplicates (will break xtset if there are)" di "* test for duplicates (will break xtset if there are)"
duplicates tag ID s_datetime, gen(dups) duplicates tag ID stata_datetime, gen(dups)
tabout dups using "$logpath/File`f'-dups-test.txt" tabout dups using "$logpath/File`f'-dups-test.txt", replace
di "* usually a few - list them" di "* usually a few - list them"
li if dups == 1 li if dups == 1
...@@ -145,7 +156,7 @@ if `do_raw' { ...@@ -145,7 +156,7 @@ if `do_raw' {
lab var ID "Household id - links to survey" lab var ID "Household id - links to survey"
lab var kwh "kWh consumed" lab var kwh "kWh consumed"
lab var s_datetime "Time stamp (STATA SIF format)" lab var stata_datetime "Time stamp (STATA SIF format)"
drop dups drop dups
compress compress
...@@ -166,11 +177,15 @@ if `do_raw' { ...@@ -166,11 +177,15 @@ if `do_raw' {
di "* Skip testing for missing for now due to file size" di "* Skip testing for missing for now due to file size"
di "* setting the xt so we don't have to do it again (takes ages)" di "* setting the xt so we don't have to do it again (takes ages)"
xtset ID s_datetime, delta(30 minutes) xtset ID stata_datetime, delta(30 minutes)
di "* save out the whole enormous file" di "* save out the whole enormous file"
preserve
drop r_datetime
save "$dpath/processed/CER-halfhour-electricity-all.dta", replace save "$dpath/processed/CER-halfhour-electricity-all.dta", replace
restore
* as CSV for R etc * as CSV for R etc
drop stata_datetime
outsheet using "$dpath/processed/CER-halfhour-electricity-all.csv", comma replace outsheet using "$dpath/processed/CER-halfhour-electricity-all.csv", comma replace
} }
else { else {
...@@ -184,11 +199,13 @@ if `do_summaries' { ...@@ -184,11 +199,13 @@ if `do_summaries' {
di "*******************************************************" di "*******************************************************"
di "* Creating a summary file by ID" di "* Creating a summary file by ID"
use "$dpath/processed/CER-halfhour-electricity-all.dta", clear
gen obs = 1 gen obs = 1
collapse (min) min_s_datetime = s_datetime (max) max_s_datetime = s_datetime (sum) obs kwh, by(ID) collapse (min) min_s_datetime = stata_datetime (max) max_s_datetime = stata_datetime (sum) obs kwh, by(ID)
gen kwh_mean = kwh_sum/obs gen kwh_mean = kwh/obs
lab var kwh_mean "Mean kwh per half hour across all obs for this ID" lab var kwh_mean "Mean kwh per half hour across all obs for this ID"
su obs kwh_mean, de su obs kwh_mean, de
...@@ -273,19 +290,21 @@ if `do_census2022' { ...@@ -273,19 +290,21 @@ if `do_census2022' {
di "* select 4 weeks around October 2009 as it matches to pre-trial survey closely" di "* select 4 weeks around October 2009 as it matches to pre-trial survey closely"
di "* -> Census2022 'sample'" di "* -> Census2022 'sample'"
use "$dpath/processed/CER-halfhour-electricity-all.dta", clear
* should already be xtset from 'do_raw' * should already be xtset from 'do_raw'
keep if tin(28sept2009 00:00, 25oct2009 23:55) keep if tin(28sept2009 00:00, 25oct2009 23:55)
* check * check
* which day does the data start/end? * which day does the data start/end?
qui: su s_datetime qui: su stata_datetime
di "Data start: " %tc `r(min)' di "Data start: " %tc `r(min)'
di "Data end: " %tc `r(max)' di "Data end: " %tc `r(max)'
di "* create day of week (remember in stata 0 = Sunday)" di "* create day of week (remember in stata 0 = Sunday)"
gen s_dow = dow(dofc(s_datetime)) gen s_dow = dow(dofc(stata_datetime))
di "* how many are missing?" di "* how many are missing?"
tab s_dow, mi tab s_dow, mi
...@@ -311,11 +330,11 @@ if `do_census2022' { ...@@ -311,11 +330,11 @@ if `do_census2022' {
* NB: if we did not specify the 'full' option this will only impute missing datetimes * NB: if we did not specify the 'full' option this will only impute missing datetimes
* between the first & last ID observation - so it fills in gaps, it does NOT full up all * between the first & last ID observation - so it fills in gaps, it does NOT full up all
* possible datetimes from the start to the end of the sample * possible datetimes from the start to the end of the sample
gen double s_dow = dow(dofc(s_datetime)) gen double s_dow = dow(dofc(stata_datetime))
gen double s_date = mdy(month(dofc(s_datetime)), day(dofc(s_datetime)), year(dofc(s_datetime))) gen double s_date = mdy(month(dofc(stata_datetime)), day(dofc(stata_datetime)), year(dofc(stata_datetime)))
format s_date %td format s_date %td
gen hours = hh(s_datetime) gen hours = hh(stata_datetime)
gen mins = mm(s_datetime) gen mins = mm(stata_datetime)
gen secs = 0 gen secs = 0
gen double s_halfhour = hms(hours, mins, secs) gen double s_halfhour = hms(hours, mins, secs)
format s_halfhour %tcHH:MM format s_halfhour %tcHH:MM
......
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment