I have the following dataset.
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
random.seed(0)
df = pd.DataFrame({'DATE_TIME': pd.date_range('2022-11-01', '2022-11-06 23:00:00', freq='20min'),
'ID': [random.randrange(1, 3) for n in range(430)]})
df['VALUE1'] = [random.uniform(110, 160) for n in range(430)]
df['VALUE2'] = [random.uniform(50, 80) for n in range(430)]
df['INSPECTION'] = df['DATE_TIME'].dt.day
# df['INSPECTION'] = df['INSPECTION'].replace(6, 1)
# df['INSPECTION'] = df['INSPECTION'].replace(3, 1)
df['MODE'] = np.select([df['INSPECTION'] == 1, df['INSPECTION'].isin([2, 3])], ['A', 'B'], 'C')
df['TIME'] = df['DATE_TIME'].dt.time
df['TIME'] = df['TIME'].astype('str')
df['TIMEINTERVAL'] = df.DATE_TIME.diff().astype('timedelta64[m]')
df['TIMEINTERVAL'] = df['TIMEINTERVAL'].fillna(0)
def to_day_period(s):
bins = ['0', '06:00:00', '13:00:00', '18:00:00', '23:00:00', '24:00:00']
labels = ['Nighttime', 'Daytime', 'Daytime', 'Nighttime', 'Nighttime']
return pd.cut(
pd.to_timedelta(s),
bins=list(map(pd.Timedelta, bins)),
labels=labels, right=False, ordered=False
)
df['TIME_OF_DAY'] = to_day_period(df['TIME'])
df_monthly = df
I would like to sort the DATE_TIME column in a way that it starts from 12:00:00 and ends at 11:59:59. How can I achieve this? I tried shifting DATE_TIME object for 12 hours and applying np.argsort on VALUE1 column, but his did not work.
For ID=1 and INSPECTION=1, suppose I have the following data:
| DATE_TIME | VALUE1 |
|---|---|
| 2022-11-01 00:00:00 | 55 |
| 2022-11-01 11:00:00 | 45 |
| 2022-11-01 12:00:00 | 40 |
| 2022-11-01 22:00:00 | 35 |
| 2022-11-01 23:00:00 | 70 |
What I expect:
| DATE_TIME | VALUE1 |
|---|---|
| 2022-11-01 12:00:00 | 40 |
| 2022-11-01 22:00:00 | 35 |
| 2022-11-01 23:00:00 | 70 |
| 2022-11-01 00:00:00 | 55 |
| 2022-11-01 11:00:00 | 45 |
>Solution :
Use numpy.lexsort with dt.normalize and the 12h subtraction that you suggested (only keeping the time):
import numpy as np
order = np.lexsort([df['DATE_TIME'].sub(pd.Timedelta('12h')).dt.time,
df['DATE_TIME'].dt.normalize()])
out = df.iloc[order]
Using a function to benefit from sort_values‘ key parameter:
def custom_date_sorter(s):
s = pd.to_datetime(s)
return np.argsort(np.lexsort([s.sub(pd.Timedelta('12h')).dt.time,
s.dt.normalize()]))
out = df.sort_values(by='DATE_TIME', key=custom_date_sorter)
Output:
DATE_TIME VALUE1
2 2022-11-01 12:00:00 40
3 2022-11-01 22:00:00 35
4 2022-11-01 23:00:00 70
0 2022-11-01 00:00:00 55
1 2022-11-01 11:00:00 45
Alternatively:
(df.assign(date=df['DATE_TIME'].dt.normalize(),
time=df['DATE_TIME'].sub(pd.Timedelta('12h')).dt.time)
.sort_values(by=['date', 'time'])
.drop(columns=['date', 'time'])
)