How should we learn from data obtained by merging partial time series?

In my case, the training and testing data are 6,816 points obtained by concatenating 142 partial time series of 48 points each.

Accuracy is particularly low for the 'buffer_size' feature, because its value resets to the initial state of 4.0 at the start of every 48-point segment.
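I suspect part of the problem is that my sliding windows cross the boundaries between the 142 concatenated segments, so some training sequences mix the end of one series with the reset at the start of the next. Here is a minimal sketch of a segment-aware window builder I am considering (it assumes every segment is exactly 48 points long; the create_sequences helper in my full code below does not do this):

import numpy as np

SEGMENT_LEN = 48  # length of each partial time series before merging

def create_sequences_per_segment(data, n_steps, target_col):
    # Build (X, y) windows that never cross a segment boundary
    X, y = [], []
    for start in range(0, len(data), SEGMENT_LEN):
        segment = data[start:start + SEGMENT_LEN]
        for i in range(len(segment) - n_steps):
            X.append(segment[i:i + n_steps, :])
            y.append(segment[i + n_steps, target_col])
    return np.array(X), np.array(y)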

Below is the full code that trains on and predicts the features 'bandwidth', 'throughput', and 'buffer_size'.

I can also share the code that merges the data, or the data file itself, if needed.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

plt.rcParams['font.family'] = 'Malgun Gothic'  # Korean font for plot labels

# Helper to create LSTM input sequences; target_col selects which feature
# column becomes the prediction target (0 = 'bandwidth',
# 1 = 'actual_throughput', 2 = 'buffer_size' in columns_to_train order)
def create_sequences(data, n_steps, target_col):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i:i + n_steps, :])
        y.append(data[i + n_steps, target_col])
    return np.array(X), np.array(y)
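
# With n_steps = 5 and the 7 selected feature columns, X has shape
# (len(data) - n_steps, n_steps, 7) and y has shape (len(data) - n_steps,)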

# Set the file path
file_path = '0102 merge data num/data.txt'

# Read the tab-separated text file into a DataFrame
column_names = ['rownum', 'bandwidth', 'actual_throughput', 'buffer_size', 'timing', 'index', 'delay', 'rebuf']
df = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# Select the columns for training
columns_to_train = ['bandwidth','actual_throughput', 'buffer_size', 'timing', 'index','delay','rebuf']
data = df[columns_to_train].values

print("len(data)",len(data))

# Number of time steps to consider for each prediction
n_steps = 5

# Single train/test split point (80% of the sequences), shared by the scaled
# and unscaled data so the test targets stay aligned with the predictions
train_size = int((len(data) - n_steps) * 0.8)

X_original, y_original = create_sequences(data, n_steps, target_col=0)

print("len(X_original)", len(X_original))
X_train_original, X_test_original = X_original[:train_size], X_original[train_size:]
y_train_original, y_test_original = y_original[:train_size], y_original[train_size:]

X1_original, y1_original = create_sequences(data, n_steps, target_col=1)
X1_train_original, X1_test_original = X1_original[:train_size], X1_original[train_size:]
y1_train_original, y1_test_original = y1_original[:train_size], y1_original[train_size:]

X2_original, y2_original = create_sequences(data, n_steps, target_col=2)
X2_train_original, X2_test_original = X2_original[:train_size], X2_original[train_size:]
y2_train_original, y2_test_original = y2_original[:train_size], y2_original[train_size:]

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)
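# Note: the scaler is fit on the full dataset, so test-set statistics leak
# into the scaling. A leakage-free variant (sketch, not what I currently run):
#     scaler = MinMaxScaler().fit(data[:int(len(data) * 0.8)])
#     data_scaled = scaler.transform(data)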


# Create sequences for the LSTM from the scaled data
X, y = create_sequences(data_scaled, n_steps, target_col=0)    # 'bandwidth'
X1, y1 = create_sequences(data_scaled, n_steps, target_col=1)  # 'actual_throughput'
X2, y2 = create_sequences(data_scaled, n_steps, target_col=2)  # 'buffer_size'


# Split the scaled sequences into training and testing sets (same train_size as above)
print(train_size)

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

X1_train, X1_test = X1[:train_size], X1[train_size:]
y1_train, y1_test = y1[:train_size], y1[train_size:]

X2_train, X2_test = X2[:train_size], X2[train_size:]
y2_train, y2_test = y2[:train_size], y2[train_size:]

# Build the LSTM model
model = Sequential()
model.add(LSTM(100, activation='tanh', return_sequences=True, input_shape=(n_steps, len(columns_to_train))))
model.add(Dropout(0.2))
model.add(LSTM(100, activation='tanh', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(3))  # three outputs: 'bandwidth', 'actual_throughput', 'buffer_size'
model.compile(optimizer='adam', loss='mse')

# Train the model: the Dense(3) head needs a three-column target, so stack the
# per-feature targets into one array
y_train_multi = np.column_stack([y_train, y1_train, y2_train])
y_test_multi = np.column_stack([y_test, y1_test, y2_test])
model.fit(X_train, y_train_multi, epochs=50, batch_size=48,
          validation_data=(X_test, y_test_multi), verbose=1)

# model.save('mpc_thp_flow_seq5_lstm_multivaraiable_model(0103,dropout).h5')

# Evaluate the model
loss = model.evaluate(X_test, y_test_multi, verbose=0)
print(f'Model Loss: {loss}')



# Make predictions
predictions = model.predict(X_test)

# Pad the 3 predicted columns with zeros so the array matches the scaler's
# 7-column input layout
predictions = np.hstack((predictions, np.zeros((predictions.shape[0], len(columns_to_train) - 3))))

# Inverse transform the predictions to the original scale
predictions_original_scale = scaler.inverse_transform(predictions)
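# The zero padding only restores the scaler's expected column layout; after the
# inverse transform, only the first three columns of predictions_original_scale
# ('bandwidth', 'actual_throughput', 'buffer_size') are meaningful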

# Calculate RMSE, MAE, and R2 for 'bandwidth' (scaled values)
rmse = np.sqrt(mean_squared_error(y_test, predictions[:, 0]))
mae = mean_absolute_error(y_test, predictions[:, 0])
r2 = r2_score(y_test, predictions[:, 0])

print(f"'bandwidth' RMSE: {rmse:.2f}")
print(f"'bandwidth' MAE: {mae:.2f}")
print(f"'bandwidth' R2 Score: {r2:.2f}")

# Calculate RMSE, MAE, and R2 for 'actual_throughput' (scaled values)
rmse1 = np.sqrt(mean_squared_error(y1_test, predictions[:, 1]))
mae1 = mean_absolute_error(y1_test, predictions[:, 1])
r21 = r2_score(y1_test, predictions[:, 1])

print(f"'actual_throughput' RMSE: {rmse1:.2f}")
print(f"'actual_throughput' MAE: {mae1:.2f}")
print(f"'actual_throughput' R2 Score: {r21:.2f}")

# Calculate RMSE, MAE, and R2 for 'buffer_size' (scaled values)
rmse2 = np.sqrt(mean_squared_error(y2_test, predictions[:, 2]))
mae2 = mean_absolute_error(y2_test, predictions[:, 2])
r22 = r2_score(y2_test, predictions[:, 2])

print(f"'buffer_size' RMSE: {rmse2:.2f}")
print(f"'buffer_size' MAE: {mae2:.2f}")
print(f"'buffer_size' R2 Score: {r22:.2f}")

# Plot true vs. predicted values for each target (scaled)
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='True Values')
plt.plot(predictions[:, 0], label='Predicted Bandwidth')  # column 0 = 'bandwidth'
plt.title("LSTM Prediction: 'bandwidth' (scaled)")
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(y1_test, label='True Values')
plt.plot(predictions[:, 1], label='Predicted Throughput')  # column 1 = 'actual_throughput'
plt.title("LSTM Prediction: 'actual_throughput' (scaled)")
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(y2_test, label='True Values')
plt.plot(predictions[:, 2], label='Predicted Buffer Size')  # column 2 = 'buffer_size'
plt.title("LSTM Prediction: 'buffer_size' (scaled)")
plt.legend()
plt.show()

# Plot true vs. predicted values for each target on the original scale
plt.figure(figsize=(12, 6))
plt.plot(y_test_original, label='True Values')
plt.plot(predictions_original_scale[:, 0], label='Predicted Bandwidth (unscaled)')  # column 0 = 'bandwidth'
plt.title("LSTM Prediction: 'bandwidth' (original scale)")
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(y1_test_original, label='True Values')
plt.plot(predictions_original_scale[:, 1], label='Predicted Throughput (unscaled)')  # column 1 = 'actual_throughput'
plt.title("LSTM Prediction: 'actual_throughput' (original scale)")
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(y2_test_original, label='True Values')
plt.plot(predictions_original_scale[:, 2], label='Predicted Buffer Size (unscaled)')  # column 2 = 'buffer_size'
plt.title("LSTM Prediction: 'buffer_size' (original scale)")
plt.legend()
plt.show()

Printing the RMSE, MAE, and R² values shows that the first two columns produce relatively good results even though they come from merged data, while the 'buffer_size' predictions in the third column are poor.

[Plot: 'bandwidth' feature prediction result]

[Plot: 'throughput' feature prediction result]

[Plot: 'buffer size' feature prediction result]

I look forward to answers from deep learning experts.
