CSV文件格式

分析CSV文件头

csv模块包含在Python标准库中,可用于分析CSV文件中的数据行

1
2
3
4
5
6
7
import csv

file_name = 'sitka_weather_07-2014.csv'
# newline='' lets the csv module do its own newline handling, as the csv
# documentation recommends for files passed to csv.reader().
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    print(reader)
    # next() consumes the first row, which holds the column headers.
    header_row = next(reader)
    print(header_row)
<_csv.reader object at 0x00000233AD31CEE0>
['AKDT', 'Max TemperatureF', 'Mean TemperatureF', 'Min TemperatureF', 'Max Dew PointF', 'MeanDew PointF', 'Min DewpointF', 'Max Humidity', ' Mean Humidity', ' Min Humidity', ' Max Sea Level PressureIn', ' Mean Sea Level PressureIn', ' Min Sea Level PressureIn', ' Max VisibilityMiles', ' Mean VisibilityMiles', ' Min VisibilityMiles', ' Max Wind SpeedMPH', ' Mean Wind SpeedMPH', ' Max Gust SpeedMPH', 'PrecipitationIn', ' CloudCover', ' Events', ' WindDirDegrees']
  • csv.reader() 创建一个阅读器对象
  • next() 返回文件中的下一行

打印文件头及其位置

1
2
3
4
5
6
7
8
9
import csv

file_name = 'sitka_weather_07-2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    header_row = next(reader)

    # Print every column header together with its index in the row.
    for index, column_header in enumerate(header_row):
        print(index, column_header)
0 AKDT
1 Max TemperatureF
2 Mean TemperatureF
3 Min TemperatureF
4 Max Dew PointF
5 MeanDew PointF
6 Min DewpointF
7 Max Humidity
8  Mean Humidity
9  Min Humidity
10  Max Sea Level PressureIn
11  Mean Sea Level PressureIn
12  Min Sea Level PressureIn
13  Max VisibilityMiles
14  Mean VisibilityMiles
15  Min VisibilityMiles
16  Max Wind SpeedMPH
17  Mean Wind SpeedMPH
18  Max Gust SpeedMPH
19 PrecipitationIn
20  CloudCover
21  Events
22  WindDirDegrees

提取并读取数据

读取每天的最高气温,也就是“Max TemperatureF”这一列(索引为1)

1
2
3
4
5
6
7
8
9
10
11
12
13
import csv

file_name = 'sitka_weather_07-2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    highs = []
    for row in reader:
        # Index 1 is read here as the daily high; values are parsed as int.
        high = int(row[1])
        highs.append(high)

print(highs)
[64, 71, 64, 59, 69, 62, 61, 55, 57, 61, 57, 59, 57, 61, 64, 61, 59, 63, 60, 57, 69, 63, 62, 59, 57, 57, 61, 59, 61, 61, 66]

我们创建了一个名为highs的空列表,再遍历文件中余下的各行。由于我们已经读取了文件头行,这个循环将从文件的第二行(也就是第一行实际数据)开始。

绘制气温图表

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import csv
import matplotlib.pyplot as plt

file_name = 'sitka_weather_07-2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    highs = []
    for row in reader:
        high = int(row[1])
        highs.append(high)

# Draw the line chart
fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(highs, c='red')

# Set the title, axis labels and tick label size
plt.title("Daily high temperatures, July 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)

plt.show()

png

模块datetime

在图表中添加日期

  • datetime.strptime(date_string, format_string) 将字符串解析为 datetime 对象

image.png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import csv
import matplotlib.pyplot as plt
from datetime import datetime

file_name = 'sitka_weather_07-2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    dates, highs = [], []
    for row in reader:
        # Index 0 is parsed as a YYYY-MM-DD date, index 1 as the daily high.
        current_date = datetime.strptime(row[0], "%Y-%m-%d")
        dates.append(current_date)
        high = int(row[1])
        highs.append(high)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red')

plt.title("Daily high temperatures, July 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)

# Rotate the x-axis date labels so that they do not overlap each other.
fig.autofmt_xdate()

plt.show()

png

涵盖更长的时间

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import csv
import matplotlib.pyplot as plt
from datetime import datetime

# Read the dates and high temperatures from the file
file_name = 'sitka_weather_2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    dates, highs = [], []
    for row in reader:
        current_date = datetime.strptime(row[0], "%Y-%m-%d")
        dates.append(current_date)
        high = int(row[1])
        highs.append(high)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red')

plt.title("Daily high temperatures, 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
# Rotate the x-axis date labels so that they do not overlap each other.
fig.autofmt_xdate()

plt.show()

png

添加最低气温数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import csv
import matplotlib.pyplot as plt
from datetime import datetime

# Read the dates, high temperatures and low temperatures from the file
file_name = 'sitka_weather_2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    dates, highs, lows = [], [], []
    for row in reader:
        # Index 0 is the date, index 1 the daily high, index 3 the daily low.
        current_date = datetime.strptime(row[0], "%Y-%m-%d")
        dates.append(current_date)
        high = int(row[1])
        highs.append(high)
        low = int(row[3])
        lows.append(low)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red', label='the hottest temperature')
plt.plot(dates, lows, c='blue', label='the coldest tempurature')

plt.title("Daily high and low temperatures - 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
fig.autofmt_xdate()
# Show the legend for the high and low temperature series
plt.legend()

plt.show()

png

给图表区域着色

plt.fill_between(x, y1, y2, where=None, **kwargs) 在两个数据序列之间填充颜色

  • y1:y 轴的下界数据点。
  • y2:y 轴的上界数据点。
  • where:一个布尔数组,指定在哪些位置填充颜色
  • facecolor: 设置填充区域的颜色
  • alpha: 设置透明度
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import csv
import matplotlib.pyplot as plt
from datetime import datetime

# Read the dates, high temperatures and low temperatures from the file
file_name = 'sitka_weather_2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    dates, highs, lows = [], [], []
    for row in reader:
        # Index 0 is the date, index 1 the daily high, index 3 the daily low.
        current_date = datetime.strptime(row[0], "%Y-%m-%d")
        dates.append(current_date)
        high = int(row[1])
        highs.append(high)
        low = int(row[3])
        lows.append(low)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red', label='the hottest temperature', alpha=0.5)
plt.plot(dates, lows, c='blue', label='the coldest tempurature', alpha=0.5)
# Shade the area between the low and high series
plt.fill_between(dates, lows, highs, facecolor='green', alpha=0.1)

plt.title("Daily high and low temperatures - 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
fig.autofmt_xdate()
plt.legend()

plt.show()

png

错误检查

我们应该能够使用有关任何地方的天气数据来运行highs_lows.py中的代码,但有些气象站会偶尔出现故障,未能收集部分或全部其应该收集的数据。缺失数据可能会引发异常,如果不妥善地处理,还可能导致程序崩溃。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import csv
import matplotlib.pyplot as plt
from datetime import datetime

# Read dates, highs and lows; int() is left unguarded here, so an empty field raises ValueError
file_name = 'death_valley_2014.csv'
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    header_row = next(reader)

    dates, highs, lows = [], [], []
    for row in reader:
        current_date = datetime.strptime(row[0], "%Y-%m-%d")
        dates.append(current_date)
        high = int(row[1])
        highs.append(high)
        low = int(row[3])
        lows.append(low)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red', label='the hottest temperature', alpha=0.5)
plt.plot(dates, lows, c='blue', label='the coldest tempurature', alpha=0.5)
# Shade the area between the low and high series
plt.fill_between(dates, lows, highs, facecolor='green', alpha=0.1)

plt.title("Daily high and low temperatures - 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
fig.autofmt_xdate()
plt.legend()

plt.show()
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

Cell In[42], line 15
     13 current_date = datetime.strptime(row[0], "%Y-%m-%d") 
     14 dates.append(current_date) 
---> 15 high = int(row[1])
     16 highs.append(high)
     17 low = int(row[3])


ValueError: invalid literal for int() with base 10: ''

该traceback指出,Python无法处理其中一天的最高气温,因为它无法将空字符串('')转换为整数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import csv
import matplotlib.pyplot as plt
from datetime import datetime

# Read the dates, high temperatures and low temperatures from the file
file_name = 'death_valley_2014.csv'
# newline='' lets the csv module do its own newline handling.
with open(file_name, newline='') as f:
    reader = csv.reader(f)
    # Consume the header row so the loop below only sees data rows.
    header_row = next(reader)

    dates, highs, lows = [], [], []
    for row in reader:
        # Parse the date on its own so that the 'missing data' message below
        # always reports the date of the row being processed, never a stale
        # (or still unbound) value from an earlier iteration.
        try:
            current_date = datetime.strptime(row[0], "%Y-%m-%d")
        except ValueError:
            print(row[0], 'invalid date')
            continue

        try:
            high = int(row[1])
            low = int(row[3])
        except ValueError:
            # int('') raises ValueError when a reading is missing; skip the row.
            print(current_date, 'missing data')
        else:
            # Only append once every field parsed, so the lists stay aligned.
            dates.append(current_date)
            highs.append(high)
            lows.append(low)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red', label='the hottest temperature', alpha=0.5)
plt.plot(dates, lows, c='blue', label='the coldest tempurature', alpha=0.5)
# Shade the area between the low and high series
plt.fill_between(dates, lows, highs, facecolor='green', alpha=0.1)

title = "Daily high and low temperatures - 2014\nDeath Valley, CA"
plt.title(title, fontsize=20)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
fig.autofmt_xdate()
plt.legend()

plt.show()
2014-02-16 00:00:00 missing data

png

制作世界人口地图:JSON格式

提取相关的数据

json.load() 用于从文件中读取 JSON 数据,并将其解析为 Python 对象(通常是字典或列表)

1
2
3
4
5
6
7
8
9
10
11
12
13
import json

# Load the data into a list
filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Print the 2010 population of every country
for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        # 'Value' is concatenated as-is, so it must already be a string here.
        population = pop_dict['Value']
        print(country_name + ": " + population)
Arab World: 357868000
Caribbean small states: 6880000
East Asia & Pacific (all income levels): 2201536674
-- __snip__ --
Yemen, Rep.: 24053000
Zambia: 12927000
Zimbabwe: 12571000

将字符串转换为数字值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import json

# Load the data into a list
filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Print the 2010 population of every country
for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        # float() parses the string (which may contain a decimal part);
        # int() then drops the fractional part and returns an integer.
        population = int(float(pop_dict['Value']))
        print(country_name + ": " + str(population))
Arab World: 357868000
Caribbean small states: 6880000
East Asia & Pacific (all income levels): 2201536674
-- __snip__ --
Yemen, Rep.: 24053000
Zambia: 12927000
Zimbabwe: 12571000

获取两个字母的国别码

简单地使用COUNTRIES来查看各国的国别码

1
2
3
4
from pygal_maps_world.i18n import COUNTRIES

# Print every country code with its country name, ordered by code.
for country_code, country_name in sorted(COUNTRIES.items()):
    print(country_code, country_name)
ad Andorra
ae United Arab Emirates
af Afghanistan
-- __snip__ --
zm Zambia
zw Zimbabwe
1
2
3
4
5
6
7
8
9
10
11
12
13
from pygal_maps_world.i18n import COUNTRIES

def get_country_code(country_name):
    """Return the two-letter country code Pygal uses for the given country.

    The name is compared for exact equality against the values of
    COUNTRIES; None is returned when no entry matches.
    """
    for code, name in COUNTRIES.items():
        if name == country_name:
            return code
    # No entry matched the given country name.
    return None

# Look up a few sample country names and print the code found for each.
for sample_name in ('Andorra', 'United Arab Emirates', 'Afghanistan'):
    print(get_country_code(sample_name))
ad
ae
af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import json

# Load the data into a list
filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Print the country code and 2010 population of every country
for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        if code:
            print(code + ": " + str(population))
        else:
            # get_country_code() found no code for this name.
            print('ERROR - ' + country_name)
ERROR - Arab World
ERROR - Caribbean small states
-- __snip__ --
ERROR - Yemen, Rep.
zm: 12927000
zw: 12571000

制作世界地图

1
2
3
4
5
6
7
8
9
10
11
12
13
import pygal_maps_world.maps
from IPython.display import SVG

# Series title and country codes for each region, in the order they are added.
americas = [
    ('North America', ['ca', 'mx', 'us']),
    ('Central America', ['bz', 'cr', 'gt', 'hn', 'ni', 'pa', 'sv']),
    ('South America', ['ar', 'bo', 'br', 'cl', 'co', 'ec', 'gf',
                       'gy', 'pe', 'py', 'sr', 'uy', 've']),
]

wm = pygal_maps_world.maps.World()
wm.title = 'North, Central, and South America'
for region_name, country_codes in americas:
    wm.add(region_name, country_codes)

# Write the map to disk, then load the file for display in the notebook.
output_file = 'americas.svg'
wm.render_to_file(output_file)
SVG(output_file)

svg

在世界地图上呈现数字数据

1
2
3
4
5
6
7
8
import pygal_maps_world.maps
from IPython.display import SVG

# Country code -> population value passed to the map series.
na_populations = {'ca': 34126000, 'us': 309349000, 'mx': 113423000}

wm = pygal_maps_world.maps.World()
wm.title = 'Populations of Countries in North America'
wm.add('North America', na_populations)

# Write the map to disk, then load the file for display in the notebook.
output_file = 'na_populations.svg'
wm.render_to_file(output_file)
SVG(output_file)

svg

绘制完整的世界人口地图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json
import pygal_maps_world.maps
from IPython.display import SVG

filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Map each country code to its 2010 population.
cc_populations = {}

for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        # Entries for which no country code was found are left off the map.
        if code:
            cc_populations[code] = population

wm = pygal_maps_world.maps.World()
wm.title = 'World Population in 2010, by Country'
wm.add('2010', cc_populations)

wm.render_to_file('world_population.svg')
SVG('world_population.svg')

svg

根据人口数量将国家分组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json
import pygal_maps_world.maps
from IPython.display import SVG

filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Map each country code to its 2010 population.
cc_populations = {}

for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        # Entries for which no country code was found are left off the map.
        if code:
            cc_populations[code] = population

# Split the countries into three groups by population. The thresholds are
# 10 million and 1 billion so that they agree with the series labels below.
cc_pops_1, cc_pops_2, cc_pops_3 = {}, {}, {}
for cc, pop in cc_populations.items():
    if pop < 10000000:
        cc_pops_1[cc] = pop
    elif pop < 1000000000:
        cc_pops_2[cc] = pop
    else:
        cc_pops_3[cc] = pop

wm = pygal_maps_world.maps.World()
wm.title = 'World Population in 2010, by Country'
wm.add('0-10m', cc_pops_1)
wm.add('10m-1bn', cc_pops_2)
wm.add('>1bn', cc_pops_3)

wm.render_to_file('world_population.svg')
SVG('world_population.svg')

svg

使用Pygal设置世界地图的样式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import json
import pygal_maps_world.maps
from pygal.style import RotateStyle
from IPython.display import SVG

filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Map each country code to its 2010 population.
cc_populations = {}

for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        # Entries for which no country code was found are left off the map.
        if code:
            cc_populations[code] = population

# Split the countries into three groups by population. The thresholds are
# 10 million and 1 billion so that they agree with the series labels below.
cc_pops_1, cc_pops_2, cc_pops_3 = {}, {}, {}
for cc, pop in cc_populations.items():
    if pop < 10000000:
        cc_pops_1[cc] = pop
    elif pop < 1000000000:
        cc_pops_2[cc] = pop
    else:
        cc_pops_3[cc] = pop

# Build the map style from a single base colour given as a hex string.
wm_style = RotateStyle('#336699')
wm = pygal_maps_world.maps.World(style=wm_style)
wm.title = 'World Population in 2010, by Country'
wm.add('0-10m', cc_pops_1)
wm.add('10m-1bn', cc_pops_2)
wm.add('>1bn', cc_pops_3)

wm.render_to_file('world_population.svg')
SVG('world_population.svg')

svg

加亮颜色主题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json
import pygal_maps_world.maps
from pygal.style import LightColorizedStyle
from IPython.display import SVG

filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Map each country code to its 2010 population.
cc_populations = {}

for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        # Entries for which no country code was found are left off the map.
        if code:
            cc_populations[code] = population

# Split the countries into three groups by population. The thresholds are
# 10 million and 1 billion so that they agree with the series labels below.
cc_pops_1, cc_pops_2, cc_pops_3 = {}, {}, {}
for cc, pop in cc_populations.items():
    if pop < 10000000:
        cc_pops_1[cc] = pop
    elif pop < 1000000000:
        cc_pops_2[cc] = pop
    else:
        cc_pops_3[cc] = pop

wm = pygal_maps_world.maps.World(style=LightColorizedStyle)
wm.title = 'World Population in 2010, by Country'
wm.add('0-10m', cc_pops_1)
wm.add('10m-1bn', cc_pops_2)
wm.add('>1bn', cc_pops_3)

wm.render_to_file('world_population.svg')
SVG('world_population.svg')

svg

改变颜色和加亮颜色主题结合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import json
import pygal_maps_world.maps
from pygal.style import LightColorizedStyle, RotateStyle
from IPython.display import SVG

filename = 'population_data.json'
# JSON text is UTF-8; say so explicitly instead of relying on the locale default.
with open(filename, encoding='utf-8') as f:
    pop_data = json.load(f)

# Map each country code to its 2010 population.
cc_populations = {}

for pop_dict in pop_data:
    if pop_dict['Year'] == '2010':
        country_name = pop_dict['Country Name']
        population = int(float(pop_dict['Value']))
        code = get_country_code(country_name)
        # Entries for which no country code was found are left off the map.
        if code:
            cc_populations[code] = population

# Split the countries into three groups by population. The thresholds are
# 10 million and 1 billion so that they agree with the series labels below.
cc_pops_1, cc_pops_2, cc_pops_3 = {}, {}, {}
for cc, pop in cc_populations.items():
    if pop < 10000000:
        cc_pops_1[cc] = pop
    elif pop < 1000000000:
        cc_pops_2[cc] = pop
    else:
        cc_pops_3[cc] = pop

# Combine the two styles: RotateStyle takes the base colour and uses
# LightColorizedStyle as its base style.
wm_style = RotateStyle('#336699', base_style=LightColorizedStyle)
wm = pygal_maps_world.maps.World(style=wm_style)
wm.title = 'World Population in 2010, by Country'
wm.add('0-10m', cc_pops_1)
wm.add('10m-1bn', cc_pops_2)
wm.add('>1bn', cc_pops_3)

wm.render_to_file('world_population.svg')
SVG('world_population.svg')

svg

在本章中,你学习了:如何使用网上的数据集;如何处理CSV和JSON文件,以及如何提取你感兴趣的数据;如何使用matplotlib来处理以往的天气数据,包括如何使用模块datetime,以及如何在同一个图表中绘制多个数据系列;如何使用Pygal绘制呈现各国数据的世界地图,以及如何设置Pygal地图和图表的样式。
有了使用CSV和JSON文件的经验后,你将能够处理几乎任何要分析的数据。大多数在线数据集都可以以这两种格式中的一种或两种下载。学习使用这两种格式为学习使用其他格式的数据做好了准备。