Toy Schedules Generation
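Generate a synthetic population (schedules and attributes). Every schedule starts as home -> ACTIVITY -> home with varying durations, where ACTIVITY is sampled with probabilities that depend on the person's work_status attribute. When more than 600 minutes of the day remain after the first activity, an extra home -> other leg is inserted before the final return home.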
In [9]:
Copied!
from pathlib import Path
import pandas as pd
from numpy.random import choice
import random
from caveat.data.synth import ActivityGen
from caveat.data.utils import generate_population, trace_to_pam
from caveat.evaluate.describe.times import (
joint_time_distributions_plot,
times_distributions_plot,
)
from caveat.evaluate.describe.transitions import sequence_prob_plot
from pathlib import Path
import pandas as pd
from numpy.random import choice
import random
from caveat.data.synth import ActivityGen
from caveat.data.utils import generate_population, trace_to_pam
from caveat.evaluate.describe.times import (
joint_time_distributions_plot,
times_distributions_plot,
)
from caveat.evaluate.describe.transitions import sequence_prob_plot
In [10]:
Copied!
# Number of synthetic people to generate.
n = 10_000

# Probabilities of the main activity for each work status. The generation cell
# passes each list as `p=` to `choice(["work", "education", "other"], ...)`,
# so the entries are ordered [work, education, other].
ps = dict(
    employed=[0.8, 0.0, 0.2],
    student=[0.2, 0.6, 0.2],
    unemployed=[0.1, 0.1, 0.8],
)

# Output locations for the generated schedules and person attributes.
schedules_write_path = Path("tmp", "toy_noisy_schedules.csv")
attributes_write_path = Path("tmp", "toy_noisy_attributes.csv")
In [20]:
Copied!
def build_day(activity, day_minutes=1440, extra_tour_threshold=600):
    """Sample the activity sequence and durations for one person's day.

    The day always starts at home, continues with ``activity`` and ends at
    home. If more than ``extra_tour_threshold`` minutes remain after the first
    activity, an additional home -> "other" leg is inserted before the final
    home stay.

    Args:
        activity: Main activity of the day ("work", "education" or "other").
        day_minutes: Total minutes to distribute over the day.
        extra_tour_threshold: Remaining-minutes threshold above which the
            extra home -> "other" leg is added.

    Returns:
        A ``(sequence, seq_durations)`` pair of equal-length lists; the
        durations sum to ``day_minutes``.
    """
    # Work and education trips leave home earlier than "other" trips.
    if activity in ("work", "education"):
        departure_time = random.randint(400, 500)
    else:
        departure_time = random.randint(500, 800)

    if activity == "other":
        act_duration = random.randint(60, 180)
    elif activity == "work":
        act_duration = random.randint(400, 600)
    else:
        act_duration = random.randint(300, 500)

    sequence = ["home", activity]
    seq_durations = [departure_time, act_duration]
    budget = day_minutes - departure_time - act_duration

    if budget > extra_tour_threshold:
        # Enough of the day is left for a short stay at home and a second,
        # "other" activity.
        home_duration = random.randint(60, 240)
        other_duration = random.randint(60, 180)
        sequence += ["home", "other"]
        seq_durations += [home_duration, other_duration]
        budget -= home_duration + other_duration

    # Whatever is left of the day is spent at home.
    sequence.append("home")
    seq_durations.append(budget)
    return sequence, seq_durations


# One entry per person.
pid = []
work_statuses = []
# One entry per activity (several per person).
pids = []
acts = []
starts = []
ends = []
durations = []

for i in range(n):
    work_status = random.choice(["employed", "student", "unemployed"])
    # The main activity is conditioned on the sampled work status.
    activity = choice(["work", "education", "other"], p=ps[work_status])
    pid.append(i)
    work_statuses.append(work_status)

    sequence, seq_durations = build_day(activity)

    # Convert the durations into consecutive [start, end) intervals.
    t = 0
    for act, duration in zip(sequence, seq_durations):
        pids.append(i)
        acts.append(act)
        starts.append(t)
        t += duration
        ends.append(t)
        durations.append(duration)

schedules = pd.DataFrame(
    {
        "pid": pids,
        "act": acts,
        "start": starts,
        "end": ends,
        "duration": durations,
    }
)
attributes = pd.DataFrame(
    {
        "pid": pid,
        "work_status": work_statuses,
    }
)

# Every person's activities must fill the whole 1440-minute day.
assert (schedules.groupby("pid")["duration"].sum() == 1440).all()

print(schedules.head(20))
print(attributes.head())
pid act start end duration 0 0 home 0 669 669 1 0 other 669 803 134 2 0 home 803 1009 206 3 0 other 1009 1101 92 4 0 home 1101 1440 339 5 1 home 0 464 464 6 1 education 464 869 405 7 1 home 869 1440 571 8 2 home 0 581 581 9 2 other 581 645 64 10 2 home 645 786 141 11 2 other 786 942 156 12 2 home 942 1440 498 13 3 home 0 737 737 14 3 other 737 909 172 15 3 home 909 1440 531 16 4 home 0 459 459 17 4 work 459 927 468 18 4 home 927 1440 513 19 5 home 0 460 460 pid work_status 0 0 student 1 1 student 2 2 unemployed 3 3 unemployed 4 4 unemployed
In [21]:
Copied!
# Make sure the output directories exist (including any missing ancestors)
# before writing, so a nested write path does not fail.
for write_path in (schedules_write_path, attributes_write_path):
    write_path.parent.mkdir(parents=True, exist_ok=True)

# Persist both tables without the DataFrame index.
schedules.to_csv(schedules_write_path, index=False)
attributes.to_csv(attributes_write_path, index=False)
In [22]:
Copied!
def describe_col(population, col: str) -> pd.DataFrame:
    """Summarise one column of ``population`` per activity type.

    Args:
        population: DataFrame with an "act" column and the column ``col``.
        col: Name of the column to summarise.

    Returns:
        DataFrame indexed by "act" with the columns count, mean, std, min and
        max, plus an "attribute" column holding ``col``.
    """
    stats = ["count", "mean", "std", "min", "max"]
    per_act = population.groupby("act")[col].describe()
    # Tag each row with the summarised column so several summaries can be
    # stacked and told apart later.
    return per_act[stats].assign(attribute=col)
def describe_cols(population, cols: list[str]) -> pd.DataFrame:
    """Summarise several columns of ``population`` per activity type.

    Args:
        population: DataFrame with an "act" column and every column in ``cols``.
        cols: Names of the columns to summarise.

    Returns:
        The per-column summaries from ``describe_col`` stacked into a single
        DataFrame indexed by ("attribute", "act").
    """
    summaries = [describe_col(population, name) for name in cols]
    stacked = pd.concat(summaries, ignore_index=False)
    # Move "act" out of the index so it can be paired with "attribute".
    return stacked.reset_index().set_index(["attribute", "act"])
# Per-activity summary of start, end and duration, rounded for display.
describe_cols(schedules, ["start", "end", "duration"]).round()
Out[22]:
count | mean | std | min | max | ||
---|---|---|---|---|---|---|
attribute | act | |||||
start | education | 2336.0 | 450.0 | 29.0 | 400.0 | 500.0 |
home | 24118.0 | 527.0 | 454.0 | 0.0 | 1242.0 | |
other | 8124.0 | 773.0 | 151.0 | 500.0 | 1078.0 | |
work | 3658.0 | 450.0 | 29.0 | 400.0 | 500.0 | |
end | education | 2336.0 | 850.0 | 64.0 | 705.0 | 993.0 |
home | 24118.0 | 969.0 | 424.0 | 400.0 | 1440.0 | |
other | 8124.0 | 893.0 | 155.0 | 560.0 | 1242.0 | |
work | 3658.0 | 949.0 | 64.0 | 801.0 | 1099.0 | |
duration | education | 2336.0 | 400.0 | 59.0 | 300.0 | 500.0 |
home | 24118.0 | 442.0 | 165.0 | 60.0 | 800.0 | |
other | 8124.0 | 119.0 | 35.0 | 60.0 | 180.0 | |
work | 3658.0 | 499.0 | 58.0 | 400.0 | 600.0 |
In [23]:
Copied!
def time_distributions(population: pd.DataFrame, mapping: dict):
    """Collect start, end and duration values per activity type.

    Args:
        population: DataFrame with "act", "start", "end" and "duration" columns.
        mapping: Dict whose values are activity names; each gets an empty list
            in every result so activities absent from ``population`` still
            appear.

    Returns:
        Tuple ``(starts, ends, durations)`` of dicts mapping activity name to
        the list of values observed for that activity.
    """
    labels = list(mapping.values())
    starts, ends, durations = ({label: [] for label in labels} for _ in range(3))
    for act, group in population.groupby("act"):
        starts[act] = list(group["start"])
        ends[act] = list(group["end"])
        durations[act] = list(group["duration"])
    return starts, ends, durations
In [24]:
Copied!
# Plot the time distributions of the generated schedules. An empty `ys` is
# passed -- presumably meaning no comparison populations; confirm against the
# caveat API. Binding the result to `_` keeps the return value out of the
# cell output.
_ = times_distributions_plot(schedules, ys={})
In [25]:
Copied!
# Plot the joint time distributions of the generated schedules. An empty `ys`
# is passed -- presumably meaning no comparison populations; confirm against
# the caveat API. Binding the result to `_` keeps the return value out of the
# cell output.
_ = joint_time_distributions_plot(schedules, ys={})
In [26]:
Copied!
# Plot the activity sequence probabilities of the generated schedules on an
# 8x6 figure. An empty `ys` is passed -- presumably meaning no comparison
# populations; confirm against the caveat API. Binding the result to `_`
# keeps the return value out of the cell output.
_ = sequence_prob_plot(schedules, ys={}, figsize=(8, 6))
In [ ]:
Copied!