directory
- Extract the periodic characteristics of a periodic sequence and use them for prediction
- Determine the cycle and calculate the cycle factor
Method 1: Divide each value by its weekly mean, then take the column-wise median
Method 2: Seasonal index method: compute the average for each weekday (or weekend day), then divide it by the overall average - then calculate the base
- Forecast = base * cycle factor
** Observe the sequence: when it shows periodic changes, the cycle factor method can be used as the baseline **
- Determine the cycle and calculate the cycle factor
- How to predict each day of the next month
- Get the average for each day of the month (1-31)
- Count how often each weekday (Monday to Sunday) falls on each day of the month
- Weight the weekly cycle factors by those frequencies to obtain a factor for each day of the month
- Forecast from the factors and the daily averages
-
Selection period
- Training set period: 2014-03-01 ~ 2014-08-03
- Test Data set: 2014-08-04 ~ 2014-08-31
-
Import packages
import pandas as pd
import numpy as np
import sklearn as skr
import datetime
import matplotlib as plt
import seaborn as sns
from dateutil import relativedelta
- Read data
def load_data(file_path):
    """Load the balance table from a CSV file and attach calendar columns.

    Args:
        file_path: Path of the CSV file to read; it must contain a
            ``report_date`` column, which is handed to ``add_timestamp``.

    Returns:
        The frame produced by ``add_timestamp`` with a fresh 0..n-1 index.
    """
    raw_table = pd.read_csv(file_path)
    stamped_table = add_timestamp(raw_table, "report_date")
    return stamped_table.reset_index(drop=True)
# Add date-part columns (day, month, year, week, weekday) to the dataset
def add_timestamp(data, date):
    """Return a copy of ``data`` with calendar columns derived from one column.

    The column named by ``date`` is parsed with the ``%Y%m%d`` format and
    stored as ``date``; ``day``, ``month``, ``year``, ``week`` and ``weekday``
    are then derived from it. The input frame is not modified.

    Args:
        data: Source DataFrame.
        date: Name of the column in ``data`` holding the dates to parse.

    Returns:
        A new DataFrame with the added columns and a fresh 0..n-1 index.
    """
    data_balance = data.copy()
    parsed = pd.to_datetime(data_balance[date], format="%Y%m%d")
    data_balance["date"] = parsed
    data_balance["day"] = parsed.dt.day
    data_balance["month"] = parsed.dt.month
    data_balance["year"] = parsed.dt.year
    # ``Series.dt.week`` was deprecated and later removed from pandas;
    # ``isocalendar().week`` yields the same ISO week numbers (as a nullable
    # unsigned integer column).
    data_balance["week"] = parsed.dt.isocalendar().week
    data_balance["weekday"] = parsed.dt.weekday
    return data_balance.reset_index(drop=True)
# Total purchase/redemption amounts aggregated by date
def total_amt(data, date):
    """Sum purchase and redemption amounts per date, from ``date`` onwards.

    Args:
        data: DataFrame with ``date``, ``total_purchase_amt`` and
            ``total_redeem_amt`` columns.
        date: Lower bound (inclusive) compared against the ``date`` column.

    Returns:
        A DataFrame with columns ``date``, ``total_purchase_amt`` and
        ``total_redeem_amt``: one row per date not earlier than ``date``,
        with a fresh 0..n-1 index.
    """
    # The columns must be selected with a list: selecting several columns
    # with a bare tuple is not accepted by current pandas.
    amount_columns = ["total_purchase_amt", "total_redeem_amt"]
    data_temp = data.groupby("date", as_index=False)[amount_columns].sum()
    return data_temp[data_temp["date"] >= date].reset_index(drop=True)
# Generate test data
def generate_data(data, start_date, end_date):
    """Append one placeholder row per day in ``[start_date, end_date)``.

    Each placeholder row carries the day in the first column of ``data`` and
    NaN in every other column. ``end_date`` itself is excluded, as before.

    Args:
        data: DataFrame whose first column holds the dates.
        start_date: First day to generate (anything ``pd.to_datetime`` accepts).
        end_date: Exclusive upper bound of the generated days.

    Returns:
        A copy of ``data`` followed by the placeholder rows, with a fresh
        0..n-1 index. If the range is empty (``start_date >= end_date``) the
        copy is returned with no rows added.
    """
    total_balance = data.copy()
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # A ``!=`` loop never terminates when start is after end or the two are
    # not a whole number of days apart; build the range with ``<`` semantics.
    placeholder_dates = pd.date_range(start_date, end_date, freq="D")
    placeholder_dates = placeholder_dates[placeholder_dates < end_date]
    if len(placeholder_dates) == 0:
        return total_balance.reset_index(drop=True)

    columns = total_balance.columns
    test_data = pd.DataFrame(
        np.nan, index=range(len(placeholder_dates)), columns=columns
    )
    test_data[columns[0]] = placeholder_dates
    total_balance = pd.concat([total_balance, test_data], axis=0)
    return total_balance.reset_index(drop=True)
# Input file locations, relative to the working directory.
user_balance_file_path = r"./Data/user_balance_table.csv"
user_info_file_path = r"./Data/user_profile_table.csv"
# Read the balance table and attach the calendar columns.
data_balance = load_data(user_balance_file_path)
# Aggregate purchase/redemption totals per date, starting at 2014-03-01.
total_balance = total_amt(data_balance,"2014-03-01")
# Append NaN placeholder rows for the test period.
# NOTE(review): generate_data stops before its end date, so 2014-08-31 itself
# gets no placeholder row - confirm this matches the intended test period.
total_balance = generate_data(total_balance,"2014-08-04","2014-8-31")
# Recompute the calendar columns so the placeholder rows have them as well.
total_balance = add_timestamp(total_balance,"date")
# Define the forecasting method based on the periodic pattern of the time series
def generate_base(data, month_index, year=2014, start_date="2014-03-01"):
    """Forecast daily purchase/redemption totals for one month (cycle factors).

    Steps:
      1. Keep the rows with ``start_date <= date < first day of the target
         month`` as the training window.
      2. Weekday factor = mean amount of each weekday / overall mean amount.
      3. For each day of the month (1-31), weight the weekday factors by how
         often that day fell on each weekday, divided by the number of
         months in the window.
      4. Base = mean amount of each day of the month / its weighted factor.
      5. Forecast for a target date = base of its day * factor of its weekday.

    Args:
        data: DataFrame with a datetime ``date`` column plus
            ``total_purchase_amt`` and ``total_redeem_amt``.
        month_index: Month (1-12) to forecast.
        year: Year of the forecast month.
        start_date: First date (inclusive) of the training window.

    Returns:
        A DataFrame with columns ``date``, ``total_purchase_amt`` and
        ``total_redeem_amt``, sorted by date, with one row per day of the
        target month whose day-of-month occurs in the training window.

    Raises:
        ValueError: If the training window contains no rows.
    """
    amount_cols = ["total_purchase_amt", "total_redeem_amt"]
    factor_cols = [col + "_weekdaymean" for col in amount_cols]
    target_start = pd.Timestamp(year, month_index, 1)

    # Select the training window; copy so the new columns do not touch `data`.
    history = data[["date"] + amount_cols]
    in_window = (history["date"] >= pd.Timestamp(start_date)) & (
        history["date"] < target_start
    )
    history = history[in_window].copy()
    if history.empty:
        raise ValueError(
            "no training rows between %s and %s" % (start_date, target_start.date())
        )
    history["day"] = history["date"].dt.day
    history["weekday"] = history["date"].dt.weekday
    # Count distinct (year, month) pairs so windows spanning years stay right.
    n_months = history["date"].dt.to_period("M").nunique()

    # Weekday factor: weekday mean relative to the overall mean.
    weekday_factor = history.groupby("weekday", as_index=False)[amount_cols].mean()
    for col in amount_cols:
        weekday_factor[col] = weekday_factor[col] / history[col].mean()
    weekday_factor = weekday_factor.rename(
        columns=dict(zip(amount_cols, factor_cols))
    )

    # How often each day of the month fell on each weekday.
    weekday_count = history.groupby(["day", "weekday"], as_index=False)["date"].count()
    weekday_count = weekday_count.merge(weekday_factor, on="weekday")
    # Weight the weekday factors by that frequency to get a per-day factor.
    for col in factor_cols:
        weekday_count[col] = weekday_count[col] * (weekday_count["date"] / n_months)
    day_rate = weekday_count.groupby("day", as_index=False)[factor_cols].sum()

    # Base: remove the weekday effect from the per-day mean. Each amount is
    # divided by its OWN factor (redeem was previously never normalised).
    day_mean = history.groupby("day", as_index=False)[amount_cols].mean()
    day_pre = day_mean.merge(day_rate, on="day", how="left")
    for col, factor in zip(amount_cols, factor_cols):
        day_pre[col] = day_pre[col] / day_pre[factor]

    # Build the target dates, keeping only days that exist in the target month.
    day_pre = day_pre[day_pre["day"] <= target_start.days_in_month].copy()
    day_pre["date"] = pd.to_datetime(
        pd.DataFrame({"year": year, "month": month_index, "day": day_pre["day"]})
    )

    # Final forecast: base * weekday factor of the target date.
    day_pre["weekday"] = day_pre["date"].dt.weekday
    day_pre = day_pre[["date", "weekday"] + amount_cols].merge(
        weekday_factor, on="weekday"
    )
    for col, factor in zip(amount_cols, factor_cols):
        day_pre[col] = day_pre[col] * day_pre[factor]
    return day_pre.sort_values("date")[["date"] + amount_cols]