Parsing and Exploring Data

Python

by techbard 2025. 4. 7. 16:42

### Load and parse a JSON data file and determine some information about it.

### 'date': '2022-05-24',
### 'tmin': 61,
### 'tmax': 80,
### 'prcp': 0.0,
### 'snow': 0.0,
### 'snwd': 0.0,
### 'awnd': 8.3},

### 'date': '2022-05-31',
### 'tmin': 68,
### 'tmax': 92,
### 'prcp': 0.0,
### 'snow': 0.0,
### 'snwd': 0.0,
### 'awnd': 4.0

import json
import os
import pprint

file_path = os.path.join('2025-03-10', 'rdu-weather-history.json')

### Open the sample weather data file and use the json module to load and parse.
### The encoding is passed explicitly because open() otherwise falls back to
### the platform's locale encoding, which is not UTF-8 on every system.
with open(file_path, "r", encoding="utf-8") as weather_file:
    weather_data = json.load(weather_file)

# print(len(weather_data))

# first item in the data
# pprint.pp(weather_data[0])

# How many days of data do we have for each year?
# The year is the first four characters of each record's 'date' string;
# dict.get supplies 0 the first time a year is seen.
years = {}

for d in weather_data:
    key = d['date'][:4]
    years[key] = years.get(key, 0) + 1

# pprint.pp(years, width=5)
# pprint.pp(years)

### What was the warmest day in the data set?
# Placeholder that is reported only when the data set is empty.
warm_day = {'date': '0000-00-00', 'tmax': 0}

# Let max() pick the record with the highest 'tmax' instead of comparing
# every record against a starting value of 0: with that approach a data set
# whose readings are all zero or negative would never replace the placeholder.
# On ties max() keeps the first record encountered.
warmest_record = max(weather_data, key=lambda rec: rec['tmax'], default=None)
if warmest_record is not None:
    warm_day = {
        'date': warmest_record['date'],
        'tmax': warmest_record['tmax'],
    }

print(f"The warmest day was {warm_day['date']} at {warm_day['tmax']} degrees.")

### What was the coldest day in the data set?
# Select the record with the lowest 'tmin'.
cold_day = min(weather_data, key=lambda x: x['tmin'])
# Report 'tmin' -- the field the day was selected by -- rather than 'tmax'.
print(f"The coldest day was {cold_day['date']} at {cold_day['tmin']} degrees.")

# output
# The warmest day was 2017-07-23 at 102 degrees.
# The coldest day was 2018-01-07 at <tmin> degrees.
#   (the value 30 recorded from an earlier run was that day's tmax)

### How many days had snowfall?
# Count the records whose 'snow' value is above zero. The total is stored
# in a one-entry dict so that it prints together with its label.
snow_days = {
    'Count of snow day': sum(1 for record in weather_data if record['snow'] > 0.0)
}

print(snow_days)

# output
# {'Count of snow day': 15}

# Same question answered with a list comprehension: gather the dates of
# all records with snowfall, then report how many there are.
snow_days = [record['date'] for record in weather_data if record['snow'] > 0.0]
snow_day_count = len(snow_days)
print(f"Snow fell on {snow_day_count} days.")

# output
# Snow fell on 15 days.

import os
import json
import pprint

### It's a very common scenario to only want to work
### on a subset of a larger dataset.
### In other words, you want to filter out the values that
### you're not interested in to be able to focus on
### the values that you are interested in.

### {"date": "2017-01-03", "tmin": 47, "tmax": 56, "prcp": 0.31, "snow": 0.0, "snwd": 0.0, "awnd": 5.37},
### {"date": "2017-01-07", "tmin": 20, "tmax": 32, "prcp": 0.6, "snow": 0.5, "snwd": 0.0, "awnd": 9.62}

file_path = os.path.join("2025-03-10", "rdu-weather-history.json")
# print(os.path.isfile(file_path))
# print(os.getcwd())

# Decode explicitly as UTF-8; without the argument open() uses the
# platform's locale encoding, which differs between systems.
with open(file_path, encoding="utf-8") as file:
    weather_data = json.load(file)

### the filter() function gives us a way to remove unwanted data points
# snow_days = list(filter(lambda x: x['snow'] > 0.0, weather_data))
# print(len(weather_data))
# print(len(snow_days))

### filter can also be used on non-numerical data, like strings
### create a subset that contains summer days with heavy rain (1 inch or more)
def is_summer_rain_day(d):  # => in fact, filtering rainy summer day
    """Return True when record *d* is a July/August day with prcp >= 1.0.

    The month is detected by searching the record's 'date' string for the
    markers "-07-" and "-08-". 'prcp' is only read when a marker is found.
    """
    if not any(marker in d['date'] for marker in ("-07-", "-08-")):
        return False
    return d['prcp'] >= 1.0

# Keep only the records accepted by is_summer_rain_day, then print how many
# matched followed by the records themselves.
summer_raindays = [*filter(is_summer_rain_day, weather_data)]
print(len(summer_raindays))
pprint.pp(summer_raindays)

###output
# 15
# [{'date': '2017-08-08',
#   'tmin': 68,
#   'tmax': 78,
#   'prcp': 1.5,
#   'snow': 0.0,
#   'snwd': 0.0,
#   'awnd': 5.82},
#  {'date': '2018-08-03',
#   'tmin': 70,
#   'tmax': 77,
#   'prcp': 1.12,
#   'snow': 0.0,
#   'snwd': 0.0,
#   'awnd': 5.82},


##################################################
### So now we've seen two ways of filtering data.
### One using a list comprehension and
### one using the filter function.
### So I'd like to use list comprehensions
### when the filtering logic is simple
### and has maybe one or at the most two conditions.
### And I'll prefer using the filter function
### when the filtering logic gets more complex
### than can just fit into one line of code.
##################################################

import os
import json

file_path = os.path.join("C:/Users/.../MyPrj/2024-11-07", "history_weather.json")
# print(os.path.exists(file_path))

# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's locale encoding.
with open(file_path, 'r', encoding="utf-8") as f:
    weather_data = json.load(f)

summer_months = ["-07-", "-08-"]
summer_rainydays = []

for data in weather_data:
    ### any() ranges over summer_months, not over data: the record counts
    ### as a summer day as soon as one of the markers occurs in its date.
    in_summer = any(m in data['date'] for m in summer_months)
    if not in_summer:
        continue
    if data['prcp'] >= 1.0:
        summer_rainydays.append(data)
print(len(summer_rainydays))
# print(summer_rainydays)

# Even when filter() is given a two-dimensional (nested) iterable, it hands the elements to the function one at a time.
def is_summer_rainyday(d):
    """Tell whether record *d* is a July or August day with prcp >= 1.0.

    A day is treated as July/August when its 'date' string contains "-07-"
    or "-08-"; the 'prcp' value is checked only for such days.
    """
    month_markers = ("-07-", "-08-")
    is_summer = any(marker in d['date'] for marker in month_markers)
    return d['prcp'] >= 1.0 if is_summer else False

# Rebuild the list with filter(), which calls is_summer_rainyday once per
# record and keeps the records for which it returns True.
summer_rainydays = list(filter(is_summer_rainyday, weather_data))
print(len(summer_rainydays))

###output
# 15
# 15

### Get cold windy rainy days
def get_cold_windy_rainy_days(weather_data=None):
    """Return the records for cold, windy days with heavy precipitation.

    A record qualifies when all three conditions hold:
      * the mean of 'tmax' and 'tmin' is below 45,
      * 'prcp' plus 'snow' is above 0.7,
      * 'awnd' is at least 10.0.

    Args:
        weather_data: Optional iterable of record dicts. When omitted
            (None), the records are loaded from the history_weather.json
            file at the hard-coded path below.

    Returns:
        A list of the qualifying records, in input order.

    Note:
        Sample output recorded before the average-temperature expression
        was corrected may list fewer days than this function returns.
    """
    import os
    import json

    if weather_data is None:
        file_path = os.path.join("C:/Users/.../MyPrj/2024-11-07", "history_weather.json")
        # Explicit encoding so decoding does not depend on the locale.
        with open(file_path, 'r', encoding="utf-8") as f:
            weather_data = json.load(f)

    def is_cold_windy_rainy_day(d):
        # The parentheses matter: without them only 'tmin' is halved
        # (tmax + tmin / 2), which overstates the average temperature.
        avg_temp = (d['tmax'] + d['tmin']) / 2
        total_prcp = d['prcp'] + d['snow']
        return avg_temp < 45 and total_prcp > 0.7 and d['awnd'] >= 10.0

    return list(filter(is_cold_windy_rainy_day, weather_data))

# Run the filter and print the list of matching records.
cold_windy_rainy_days = get_cold_windy_rainy_days()
print(cold_windy_rainy_days)
###output
###[{'date': '2022-01-21', 'tmin': 22, 'tmax': 30, 'prcp': 0.15, 'snow': 1.5, 'snwd': 2.0, 'awnd': 10.7}]

### It's probably not a surprise to you
### that sorting is one of the most common data operations.

import os
import json
# import pprint

file_path = os.path.join("C:/Users/SKTelecom/UserApps/PyScripter/MyPrj/2024-11-07", "history_weather.json")
# print(os.path.exists(file_path))

# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's locale encoding.
with open(file_path, 'r', encoding="utf-8") as f:
    weather_data = json.load(f)

### create a subset of the data for days that had snowfall
snow_days = [data for data in weather_data if data['snow'] > 0]
print(len(snow_days))
# Largest snowfall first.
sorted_snow_days = sorted(snow_days, key=lambda d: d['snow'], reverse=True)
# Print the amounts on one line separated by ' / '. str.join places the
# separators by position; looking each record up with list.index() rescans
# the list on every iteration and matches by dict equality, so two equal
# records would put the line ending in the wrong place.
if sorted_snow_days:
    print(' / '.join(str(s['snow']) for s in sorted_snow_days))

### Sort on multiple fields: first by snowfall, then by average wind speed.
# Copy the snow-day list and sort the copy in place. The tuple key orders
# by 'snow' first and falls back to 'awnd' when snowfall is equal.
sorted_dataset = list(snow_days)
sorted_dataset.sort(key=lambda rec: (rec['snow'], rec['awnd']))
print(len(sorted_dataset))
# Number the rows from 01 for display.
for i, d in enumerate(sorted_dataset):
    print(f"[{i+1:02d}] snow: {d['snow']} awnd: {d['awnd']}")

###output
# 15
# 7.0 / 5.9 / 2.5 / 1.9 / 1.6 / 1.5 / 1.4 / 0.9 / 0.79 / 0.5 / 0.39 / 0.3 / 0.3 / 0.2 / 0.2
# 15
# [01] snow: 0.2 awnd: 3.58
# [02] snow: 0.2 awnd: 5.59
# [03] snow: 0.3 awnd: 4.92
# [04] snow: 0.3 awnd: 5.82
# [05] snow: 0.39 awnd: 8.7
# [06] snow: 0.5 awnd: 9.62
# [07] snow: 0.79 awnd: 3.8
# [08] snow: 0.9 awnd: 4.25
# [09] snow: 1.4 awnd: 7.16
# [10] snow: 1.5 awnd: 10.7
# [11] snow: 1.6 awnd: 8.72
# [12] snow: 1.9 awnd: 7.61
# [13] snow: 2.5 awnd: 6.49
# [14] snow: 5.9 awnd: 5.82
# [15] snow: 7.0 awnd: 15.21

### Using built-in map function to transform from the original data to wanted format.

import os
import copy
import json
import pprint

file_path = os.path.join("C:/Users/.../2024-11-07", "history_weather.json")

# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's locale encoding.
with open(file_path, 'r', encoding="utf-8") as f:
    weather_data = json.load(f)

### Convert the weather data from imperial to metric units.
def ToC(f):
    """Convert a Fahrenheit temperature to Celsius.

    A value of None is treated as 0 before the conversion.
    """
    fahrenheit = f if f is not None else 0
    return (fahrenheit - 32) * 5 / 9

def ToMM(i):
    """Convert inches to millimetres by multiplying by 25.4.

    A value of None is treated as 0.
    """
    if i is None:
        i = 0
    return i * 25.4

def ToKPH(s):
    """Convert miles per hour to kilometres per hour (factor 1.60934).

    A value of None is treated as 0.
    """
    speed = 0 if s is None else s
    kph = speed * 1.60934
    return kph

def ToMetric(wd):
    """Return a shallow copy of record *wd* with its readings converted.

    'tmin' and 'tmax' go through ToC, 'prcp', 'snow' and 'snwd' through
    ToMM, and 'awnd' through ToKPH. The input record is left unmodified.
    """
    # Field name paired with the converter applied to it, in the order
    # the fields are rewritten.
    conversions = (
        ('tmin', ToC),
        ('tmax', ToC),
        ('prcp', ToMM),
        ('snow', ToMM),
        ('snwd', ToMM),
        ('awnd', ToKPH),
    )
    new_wd = copy.copy(wd)
    for field, convert in conversions:
        new_wd[field] = convert(wd[field])
    return new_wd

# Convert every record with map(), then show the first record before and
# after the conversion.
metric_weather = [*map(ToMetric, weather_data)]
pprint.pp(weather_data[0])
pprint.pp(metric_weather[0])


# convert objects to tuple
def avg_temp(t1, t2):
    """Return the arithmetic mean of the two temperatures as a float."""
    return (t1 + t2) / 2.0


def _date_and_avg_temp(d):
    """Map a record to a (date, mean of tmax and tmin) tuple."""
    return (d['date'], avg_temp(d['tmax'], d['tmin']))


tuple_data = list(map(_date_and_avg_temp, weather_data))
print(tuple_data[0:2])

###output
# {'date': '2017-01-03',
 # 'tmin': 47,
 # 'tmax': 56,
 # 'prcp': 0.31,
 # 'snow': 0.0,
 # 'snwd': 0.0,
 # 'awnd': 5.37}
# {'date': '2017-01-03',
 # 'tmin': 8.333333333333334,
 # 'tmax': 13.333333333333334,
 # 'prcp': 7.874,
 # 'snow': 0.0,
 # 'snwd': 0.0,
 # 'awnd': 8.6421558}
# [('2017-01-03', 51.5), ('2017-01-07', 26.0)]

import json
import pprint

def get_day_temp_description(weather_data=None):
    """Label each day as "cold", "warm" or "hot" by its average temperature.

    The average is the mean of the record's 'tmin' and 'tmax':
      * 60 or below      -> "cold"
      * above 60, below 80 -> "warm"
      * 80 or above      -> "hot"

    Args:
        weather_data: Optional iterable of record dicts. When omitted
            (None), the records are loaded from the history_weather.json
            file at the hard-coded path below.

    Returns:
        A list of (date, description) tuples, one per record, in input
        order.
    """
    if weather_data is None:
        file_path = os.path.join("C:/Users/.../2024-11-07", "history_weather.json")
        # Explicit encoding so decoding does not depend on the locale.
        with open(file_path, 'r', encoding="utf-8") as f:
            weather_data = json.load(f)

    def average_temp_to_desc(d):
        avg_temp = (d['tmin'] + d['tmax']) / 2
        if avg_temp <= 60:
            desc = "cold"
        elif avg_temp < 80:
            # Reaching this branch already implies avg_temp > 60.
            desc = "warm"
        else:
            desc = "hot"
        return (d['date'], desc)

    return list(map(average_temp_to_desc, weather_data))

# Describe every day, then pretty-print the first five (date, description)
# pairs.
desc = get_day_temp_description()
pprint.pp(desc[:5])

###output
# [('2017-01-03', 'cold'),
 # ('2017-01-07', 'cold'),
 # ('2017-01-10', 'cold'),
 # ('2017-01-13', 'warm'),
 # ('2017-01-15', 'cold')]