I have trained a model and deployed as endpoint using aws sagemaker and when I tried to invoke I have got error:
2025-09-09 14:58:25.724914: I external/org_tensorflow/tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: FAILED_PRECONDITION: Could not find variable lstm_model/dense/bias. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status error message=Resource localhost/lstm_model/dense/bias/N10tensorflow3VarE does not exist.
train.py code
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib
from io import BytesIO
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import sagemaker
import boto3
import logging
import openpyxl
# setting up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("model_aiops.py execution started")
def main():
# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('--train-s3-path', type=str, required=True)
parser.add_argument('--test-s3-path', type=str, required=True)
parser.add_argument('--output-s3-path', type=str, required=True)
parser.add_argument('--model_dir', type=str, default='/opt/ml/model')
args = parser.parse_args()
# Download Excel file from S3
s3 = boto3.client('s3')
train_bucket, train_key = args.train_s3_path.replace("s3://", "").split("/", 1)
train_excel_obj = s3.get_object(Bucket=train_bucket, Key=train_key)
train_excel_data = train_excel_obj['Body'].read()
test_bucket, test_key = args.test_s3_path.replace("s3://", "").split("/", 1)
test_excel_obj = s3.get_object(Bucket=test_bucket, Key=test_key)
test_excel_data = test_excel_obj['Body'].read()
# Load the Excel file
train_df = pd.read_excel(BytesIO(train_excel_data), engine='openpyxl')
test_df = pd.read_excel(BytesIO(test_excel_data), engine='openpyxl')
# Setup
id_cols = ['ExComm Leader', 'Sector', 'Cloud Vendor']
train_months = [col for col in train_df.columns if col not in id_cols]
forecast_months = ['May 2025', 'Jun 2025', 'Jul 2025']
month_keys = ['may', 'jun', 'jul']
# Prepare results
forecast_results = []
actual_binary_all = []
predicted_binary_all = []
# Create output directory for plots
os.makedirs("comparison_plots_lstm", exist_ok=True)
# Forecast loop
for keys, group in train_df.groupby(id_cols):
leader, sector, vendor = keys
train_values = group[train_months].values.flatten()
train_dates = pd.to_datetime(train_months, format='%b %Y', errors='coerce')
# Filter training data from Jan 2022 to Apr 2025
mask = (train_dates >= pd.to_datetime('2022-01-01')) & (train_dates <= pd.to_datetime('2025-04-30'))
filtered_values = train_values[mask]
if len(filtered_values) < 12:
continue
# Normalize data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(filtered_values.reshape(-1, 1))
# Prepare sequences
X, y = [], []
for i in range(3, len(scaled_data)):
X.append(scaled_data[i-3:i])
y.append(scaled_data[i])
X, y = np.array(X), np.array(y)
# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=100, verbose=0)
# Forecast next 3 months
last_sequence = scaled_data[-3:]
predictions = []
current_input = last_sequence.copy()
for _ in range(3):
pred = model.predict(current_input.reshape(1, 3, 1), verbose=0)
predictions.append(pred[0][0])
current_input = np.append(current_input[1:], pred, axis=0)
# Inverse transform predictions
y_pred = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()
result = {'ExComm Leader': leader, 'Sector': sector, 'Cloud Vendor': vendor}
# Get actual values from test data
actual_row = test_df[
(test_df['ExComm Leader'] == leader) &
(test_df['Sector'] == sector) &
(test_df['Cloud Vendor'] == vendor)
]
accuracy_values = []
total = 0
actual_values = []
predicted_values = []
for key, month, pred_val in zip(month_keys, forecast_months, y_pred):
actual_val = actual_row.iloc[0][month] if not actual_row.empty and month in actual_row.columns else None
result[f"actual_{key}"] = actual_val
result[f"predicted_{key}"] = pred_val
if pd.notna(actual_val) and pd.notna(pred_val):
accuracy = (1 - abs(actual_val - pred_val) / actual_val) * 100
result[f"accuracy_{key}"] = round(accuracy, 2)
accuracy_values.append(accuracy)
total += pred_val
threshold = actual_val * 0.05
actual_binary_all.append(1 if actual_val > threshold else 0)
predicted_binary_all.append(1 if pred_val > threshold else 0)
actual_values.append(actual_val)
predicted_values.append(pred_val)
result['Total'] = total
result['model_accuracy'] = round(np.mean(accuracy_values), 2) if accuracy_values else None
result['F1_score_overall'] = round(f1_score(actual_binary_all, predicted_binary_all), 2) if actual_binary_all else None
forecast_results.append(result)
# Visualization
if actual_values and predicted_values:
months = forecast_months
plt.figure(figsize=(10, 6))
plt.plot(months, actual_values, marker='o', label='Actual', color='green')
plt.plot(months, predicted_values, marker='o', label='Predicted', color='blue')
plt.title(f"{leader} | {sector} | {vendor}", fontsize=14)
plt.xlabel("Month", fontsize=12)
plt.ylabel("Cost (USD)", fontsize=12)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.xticks(rotation=45)
filename = f"comparison_plots_lstm/{leader}_{sector}_{vendor}_May_Jul_2025_LSTM.png".replace(" ", "_")
plt.savefig(filename)
plt.close()
# Define local folder and S3 path
local_folder = "comparison_plots_lstm"
# Parse bucket and key prefix
output_bucket, output_key = args.output_s3_path.replace("s3://", "").split("/", 1)
# Walk through the folder and upload each file
for root, _, files in os.walk(local_folder):
for file in files:
local_path = os.path.join(root, file)
relative_path = os.path.relpath(local_path, local_folder)
s3_key = os.path.join(output_key, local_folder, relative_path).replace("\\", "/")
# Upload file
s3.upload_file(local_path, output_bucket, s3_key)
print(f"Uploaded {local_path} to s3://{output_bucket}/{s3_key}")
# Save results
forecast_df = pd.DataFrame(forecast_results)
# forecast_df.to_excel("Predict_group_forecast_LSTM.xlsx", index=False)
output_buffer = BytesIO()
forecast_df.to_excel(output_buffer, index=False, engine='openpyxl')
output_buffer.seek(0)
s3.upload_fileobj(output_buffer, output_bucket, output_key + "/Predict_group_forecast_LSTM.xlsx")
# save model to s3
@tf.function(input_signature=[tf.TensorSpec([None, X.shape[1], X.shape[2]], tf.float32)])
def serving_fn(inputs):
return {'outputs': model(inputs)}
model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')
model_path = os.path.join(model_dir, '1')
tf.saved_model.save(model, model_path, signatures={'serving_default': serving_fn})
# tf.saved_model.save(model, model_path)
print("checking the model signature keys")
model_saved = tf.saved_model.load('/opt/ml/model/1')
print(list(model_saved.signatures.keys()))
infer = model_saved.signatures["serving_default"]
print(infer.structured_input_signature)
for var in infer.variables:
print(var.name)
# Print overall F1 score
overall_f1 = f1_score(actual_binary_all, predicted_binary_all) if actual_binary_all else None
print("Overall F1 Score:", round(overall_f1, 2) if overall_f1 is not None else "N/A")
print("Forecasting for May to July 2025 using LSTM completed.")
if __name__ == "__main__":
main()
estimator.py
import sagemaker
import os
import boto3
from sagemaker.tensorflow import TensorFlow
# Get arguments from env variables
print(os.getenv("AWS_REGION"))
region = os.getenv('AWS_REGION', 'us-east-1')
bucket = os.getenv('S3_BUCKET')
aws_session = boto3.Session(
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID"),
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY"),
region_name = region
)
# Create SageMaker session and get execution role
sagemaker_session = sagemaker.Session(boto_session=aws_session)
role = os.getenv("SAGEMAKER_ROLE")
# Define S3 paths
source_dir_path = f's3://{bucket}/cloud_aiops/cloud_aiops.tar.gz'
train_s3_path = f's3://{bucket}/dataset/AZURE_DATASET_TRAIN.xlsx'
test_s3_path = f's3://{bucket}/dataset/AZURE_DATASET_TEST.xlsx'
output_s3_path = f's3://{bucket}/output'
# Create the Estimator
estimator = TensorFlow(
entry_point='model_aiops.py', # Your training script
source_dir=source_dir_path, # Folder containing train.py
role=role,
instance_count=1,
instance_type='ml.m5.large',
framework_version='2.18',
py_version='py310',
hyperparameters={
'train-s3-path': train_s3_path,
'test-s3-path': test_s3_path,
'output-s3-path': output_s3_path
},
sagemaker_session=sagemaker_session
)
# Launch the training job
estimator.fit()
# Deploy the trained model as a Sagemaker Endpoint
predictor = estimator.deploy(
initial_instance_count=1,
instance_type="ml.m5.large",
endpoint_name="tf-endpoint"
)
I have code block in my train.py to test the signature and variables and I got appropriate output, it is throwing error when I invoke endpoint:
print("checking the model signature keys")
model_saved = tf.saved_model.load('/opt/ml/model/1')
print(list(model_saved.signatures.keys()))
infer = model_saved.signatures["serving_default"]
print(infer.structured_input_signature)
for var in infer.variables:
print(var.name)
I have tried to save the model without /1 directory but while deploying sagemaker unable to find the SavedModel. Couldn't change my training model logic so need help to figure the root cause.