extract-DECC-LSOA-energy-data-from-excel.do 12.1 KB
Newer Older
1
2
3
*******************************************
* Script to:
* - Extract data from DECC LSOA level domestic energy consumption data (https://www.gov.uk/government/collections/mlsoa-and-llsoa-electricity-and-gas-estimates)
Ben Anderson's avatar
Ben Anderson committed
4

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*   

Copyright (C) 2014  University of Southampton

Author: Ben Anderson (b.anderson@soton.ac.uk, @dataknut, https://github.com/dataknut) 
	[Energy & Climate Change, Faculty of Engineering & Environment, University of Southampton]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License 
(http://choosealicense.com/licenses/gpl-2.0/), or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

#YMMV - http://en.wiktionary.org/wiki/YMMV

*/
Ben Anderson's avatar
Ben Anderson committed
25
26
27
28
29
30
31

clear all

capture log close

set more off

Ben Anderson's avatar
Ben Anderson committed
32
33
34
35
36
local where = "/Users/ben/Documents"
local projroot = "`where'/Work/Data/Social Science Datatsets/DECC"

local dpath = "`projroot'/LSOA Energy Data"
local dofiles = "`projroot'/do_files"
Ben Anderson's avatar
Ben Anderson committed
37
38
39
local rpath = "`projroot'/LSOA Energy Data/processed"

* census data location
Ben Anderson's avatar
Ben Anderson committed
40
local cpath = "`where'/Work/Data/Social Science Datatsets/UK Census/2001Data"
Ben Anderson's avatar
Ben Anderson committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
local eimdpath = "`where'/Work/Data/Social Science Datatsets/Indices of Deprivation/English ID 2010"
local pcluts = "`where'/Work/Data/GIS data/UK Postcodes/pcluts_2007nov_processed"
local urbpath = "`where'/Work/Data/Social Science Datatsets/UK Urban Rural Classification"

log using "`rpath'/extract-DECC-LSOA-energy-data-from-excel-$S_DATE.smcl", replace

local years "2007 2008 2009 2010"
* set to 1 to get set-up etc to run
local do_2007 = 0
local do_2008 = 0
local do_2009 = 0
local do_2010 = 0
local do_matrix_graphs = 0
local do_bar_graphs = 1




* 2007 = experimental pilots

local dist_testvars "morphologyname imd2010_dec"

* 2008
if `do_2008' {
	* 2008 - data in GOR tab format!!
	local y = "2008"
	local gor1 "North West"
	local gor2 "West Midlands"
	local gor3 "East Midlands"
	local gor4 "North East"
	local gor5 "Yorkshire and Humber"
	local gor6 "East of England"
	local gor7 "Greater London"
	local gor8 "South East"
	local gor9 "South West"
	local gor10 "Wales"
	
	foreach g of numlist 1/10 {
		di "* Importing GOR `g'"
		capture noisily import excel "`dpath'/`y'/790-llsoa-electricity-`y'-england-wales.xls", sheet("`gor`g''") cellrange(A13) allstring firstrow clear
		save "`dpath'/`y'/790-llsoa-electricity-`y'-gor`g'.dta", replace
	}
	
	* start with first
	use "`dpath'/`y'/790-llsoa-electricity-`y'-gor1.dta", clear
	
	* append the others
	foreach g of numlist 2/10 {
		append using "`dpath'/`y'/790-llsoa-electricity-`y'-gor`g'.dta"
	}
	
	destring Ordinarydomesticconsumption Economy7consumption Numberofordinarydomesticmete ///
		Numberofeconomy7meters Averageordinarydomesticconsum Averageeconomy7consumption, replace float force
	
	
	rename LLSOAcode lsoacode_`y'
	gen zonecode = lsoacode_`y'
	
	* duplicates? Caused by disclosure control?
	duplicates report zonecode
	duplicates tag zonecode, gen(dups_`y')
	gen gor_`y' = GOR
	lab var gor_`y' "Government Office Region (`y' data)"
	*lab def gor_`y' 1 "North East" 2 "North West" 3 "Yorkshire and Humber" 4 "East Midlands" 5 "West Midlands" 6 "East of England" 7 "Greater London" 8 "South East" 9 "South West" 10 "Wales"
	*lab val gor_`y' gor_`y'
	tab gor_`y' dups_`y', mi
	duplicates drop zonecode, force
	
	rename Ordinarydomesticconsumption OrdConsumption_`y'
	rename Economy7consumption Econ7consumption_`y'
	rename Numberofordinarydomesticmete NumOrdMeters_`y'
	rename Numberofeconomy7meters NumEcon7Meters_`y'
	rename Averageordinarydomesticconsum AvgOrdConsumption_`y'
	rename Averageeconomy7consumption AvgEcon7consumption_`y'
	
	lab var OrdConsumption_`y' "Ordinary domestic consumption `y'"
	lab var Econ7consumption_`y' "Economy 7 consumption `y'"
	lab var NumOrdMeters_`y' "Number of ordinary domestic meters `y'"
	lab var NumEcon7Meters_`y' "Number of Economy 7 meters `y'"
	lab var AvgOrdConsumption_`y' "Average domestic consumption `y'"
	lab var AvgEcon7consumption_`y' "Average Economy 7 consumption `y'"


	save "`dpath'/processed/llsoa-electricity-`y'-all.dta", replace
	di "* Done `y'"
	di "*****************************"
}

if `do_2009' {
	* 2009 - data in .csv files
	local y = "2009"
	insheet using "`dpath'/`y'/2347-llsoa-domestic-elec-raw.csv", clear
	rename llsoa lsoacode_`y'
	gen zonecode = lsoacode_`y'
	rename consumption_dom OrdConsumption_`y'
	lab var OrdConsumption_`y' "Ordinary domestic consumption `y'"
	rename consumption_e7 Econ7consumption_`y'
	lab var Econ7consumption_`y' "Economy 7 consumption `y'"
	rename no_mpan_dom NumOrdMeters_`y'
	lab var NumOrdMeters_`y' "Number of ordinary domestic meters `y'"
	rename no_mpan_e7 NumEcon7Meters_`y'
	lab var NumEcon7Meters_`y' "Number of Economy 7 meters `y'"
	rename av_consumption_dom AvgOrdConsumption_`y'
	lab var AvgOrdConsumption_`y' "Average domestic consumption `y'"
	rename av_consumption_e7 AvgEcon7consumption_`y'
	lab var AvgEcon7consumption_`y' "Average Economy 7 consumption `y'"
	* new for 2009
	rename no_dis_dom no_dis_dom_`y'
	rename no_dis_e7 no_dis_e7_`y'
	
	* duplicates? Caused by disclosure control?
	duplicates report zonecode
	duplicates tag zonecode, gen(dups_`y')

	tab la_name dups_`y', mi
	duplicates drop zonecode, force

	save "`dpath'/processed/llsoa-electricity-`y'-all.dta", replace
	di "* Done `y'"
	di "*****************************"
 
}

if `do_2010' {
	* 2010 - data 
	local y = "2010"
	
	import excel "`dpath'/`y'/4813-llsoa-domestic-elec-est-`y'-fixed.xls", sheet("LLSOA Electricity Domestic") firstrow clear

	* insheet using "`dpath'/`y'/4813-llsoa-domestic-elec-est-2010.csv", clear comma
	rename lsoa lsoacode_`y'
	gen zonecode = lsoacode_`y'
	rename  Ordinarydomesticconsumption OrdConsumption_`y'
	lab var OrdConsumption_`y' "Ordinary domestic consumption `y'"
	rename  Economy7consumption Econ7consumption_`y'
	lab var Econ7consumption_`y' "Economy 7 consumption `y'"
	rename  Numberofordinarydomesticmeters NumOrdMeters_`y'
	lab var NumOrdMeters_`y' "Number of ordinary domestic meters `y'"
	rename  Numberofeconomy7meters NumEcon7Meters_`y'
	lab var NumEcon7Meters_`y' "Number of Economy 7 meters `y'"
	rename  Averageordinarydomesticconsumption AvgOrdConsumption_`y'
	lab var AvgOrdConsumption_`y' "Average domestic consumption `y'"
	rename  Averageeconomy7consumption AvgEcon7consumption_`y'
	lab var AvgEcon7consumption_`y' "Average Economy 7 consumption `y'"
	
	* duplicates? Caused by disclosure control?
	duplicates report zonecode
	duplicates tag zonecode, gen(dups_`y')

	tab la_name dups_`y', mi
	duplicates drop zonecode, force

	save "`dpath'/processed/llsoa-electricity-`y'-all.dta", replace
	di "* Done `y'"
	di "*****************************"

}

* merge the files starting with baseline LSOA geography file so we can see what is missing
use "`urbpath'/Eng_Wales/lsoa/RUURB_2005_MAR_LSOA_EW.dta", clear

merge 1:1 zonecode using "`pcluts'/NSPDF_NOV_2007_UK_1M_ew_lsoacode.dta", gen(m_lsoa_postcode)

* English IMD 2010
* non-matches will be Wales
merge 1:1 zonecode using "`eimdpath'/ID-2010-indices-domains.dta", gen(m_lsoa_eimd)
* English address & postcode counts
merge 1:1 zonecode using "`pcluts'/NSPDF_NOV_2007_UK_1M_uk_lsoacode_addr_counts.dta", gen(m_address_counts)

* now DECC data - will be non-matches due to aggregation/non-disclosure
merge 1:1 zonecode using "`dpath'/processed/llsoa-electricity-2008-all.dta", gen(m_elec_2008)
merge 1:1 zonecode using "`dpath'/processed/llsoa-electricity-2009-all.dta", gen(m_elec_2009)
merge 1:1 zonecode using "`dpath'/processed/llsoa-electricity-2010-all.dta", gen(m_elec_2010)

* first line is junk for some reason
li in 1/2
drop in 1
	
* test merges
foreach m of varlist m_* {
	tab `m'
}

su *_2008 *_2009 *_2010

local testvars "OrdConsumption Econ7consumption AvgOrdConsumption AvgEcon7consumption"
local years "2008 2009 2010"
local diffyears "2008 2009"

foreach v of local testvars {
	* 2009 - 2008
	gen `v'_09_08_dif = `v'_2009 - `v'_2008
	lab var `v'_09_08_dif "Change in `v' (2009 - 2008)"
	
	gen `v'_09_08_difpc = 100*(`v'_09_08_dif/`v'_2008)
	lab var `v'_09_08_difpc "Change in `v' (2009 - 2008) as % of 2008"
	
	* 2010 - 2009
	gen `v'_10_09_dif = `v'_2010 - `v'_2009
	lab var `v'_10_09_dif "Change in `v' (2010 - 2009)"

	gen `v'_10_09_difpc = 100*(`v'_10_09_dif/`v'_2009)
	lab var `v'_10_09_difpc "Change in `v' (2010 - 2009) as % of 2009"

	* 2010 - 2008
	gen `v'_10_08_dif = `v'_2010 - `v'_2008
	lab var `v'_10_08_dif "Change in `v' (2010 - 2008)"

	gen `v'_10_08_difpc = 100*(`v'_10_08_dif/`v'_2008)
	lab var `v'_10_08_difpc "Change in `v' (2010 - 2008) as % of 2008"

	foreach y of local years {
		di "****************"
		di "* Testing `v' for `y'"
		di "*"
		gen z`v'_`y' = 0
		lab var z`v'_`y' "Value is less than zero (`v', `y')"
		replace z`v'_`y' = 1 if `v'_`y' < 0
		
		* How many LSOAs have negative values?
		table regionname morphologyname z`v'_`y' 
		* which LSOAs are they?
		li regionname districtname lowersoacode NumOrdMeters_`y' NumEcon7Meters_`y' z`v'_`y' `v'_`y' if z`v'_`y' == 1, sep(0) noobs
		di "*"
		di "* End testing `v' for `y'"
		di "****************"
	}
	di "****************"
}


if `do_matrix_graphs' {
	* test year change

	graph matrix OrdConsumption_*, msize(tiny) name(OrdConsumption) half scale(0.75)
	graph export "`rpath'/matrix-OrdConsumption.png", replace
	graph matrix Econ7consumption_*, msize(tiny) name(Econ7consumption) half scale(0.75)
	graph export "`rpath'/matrix-Econ7consumption.png", replace
	
	graph matrix NumOrdMeters_*, msize(tiny) name(NumOrdMeters) half scale(0.75) 
	graph export "`rpath'/matrix-NumOrdMeters.png", replace
	graph matrix NumEcon7Meters_*, msize(tiny) name(NumEcon7Meters) half scale(0.75) 
	graph export "`rpath'/matrix-NumEcon7Meters.png", replace
	
	graph matrix AvgOrdConsumption_*, msize(tiny) name(AvgOrdConsumption) half scale(0.75) 
	graph export "`rpath'/matrix-AvgOrdConsumption.png", replace
	graph matrix AvgEcon7consumption_*, msize(tiny) name(AvgEcon7consumption) half scale(0.75) 
	graph export "`rpath'/matrix-AvgEcon7consumption.png", replace
	
	* compare with address counts
	graph matrix NumOrdMeters_* g_uk_address_count g_uk_deliverypoint_count, msize(tiny) name(address_counts) half scale(0.75)
	graph export "`rpath'/matrix-ord-meter-address-counts.png", replace
}

if `do_bar_graphs' {
	graph hbar *_difpc, by(morphologyname) name(hbar_morph)
	graph export "`rpath'/hbar-difpc-morphologyname.png", replace
	graph hbar *_difpc, by(imd2010_dec) name(hbar_imd2010_dec)
	graph export "`rpath'/hbar-difdif-c-imd2010_dec.png", replace

}

drop v*

su z*

su *dif*

local vars = "NumOrdMeters NumEcon7Meters"
local years "2008 2009 2010"

foreach v of local vars {
	foreach y of local years {
		gen `v'_`y'_addr_ppn = `v'_`y'/g_uk_address_count
		gen `v'_`y'_delptc_ppn = `v'_`y'/g_uk_deliverypoint_count
	}
}

local vars = "OrdConsumption_ Econ7consumption_ AvgOrdConsumption_ AvgEcon7consumption_ NumOrdMeters_ NumEcon7Meters_"

foreach dv of local dist_testvars {
	di "* Testing by `dv'"
	foreach v of local vars {
		di "* -> Testing `v'* "
		tabstat `v'*, by(`dv')
	}
	di "* -> Testing NumMeters_* "
	tabstat *Meters_*_addr_ppn , by(`dv')
	tabstat *Meters_*_delptc_ppn , by(`dv')
}


preserve
	keep zonecode regioncode regionname countycode countyname districtcode districtname *200* *dif*
	outsheet using "`dpath'/processed/llsoa-electricity-all-years-England-geo.csv", comma replace
	outsheet using "`dpath'/processed/llsoa-electricity-all-years-England-geo-southwest.csv" if regionname == "South West", comma replace
	export excel using "`dpath'/processed/llsoa-electricity-all-years-England-geo.xls", sheet("data") firstrow(variables) replace

restore
save "`dpath'/processed/llsoa-electricity-all-years-England-geo.dta", replace


log close