0

I trained a model and deployed it as an endpoint using AWS SageMaker. When I invoke the endpoint, I get the following error:

2025-09-09 14:58:25.724914: I external/org_tensorflow/tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: FAILED_PRECONDITION: Could not find variable lstm_model/dense/bias. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status error message=Resource localhost/lstm_model/dense/bias/N10tensorflow3VarE does not exist.

train.py code

import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import joblib
from io import BytesIO
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import sagemaker
import boto3
import logging
import openpyxl

# setting up logging
# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Module-level statement: emitted as soon as the file is loaded, before main() runs.
logger.info("model_aiops.py execution started")

def _split_s3_uri(uri):
    """Split an ``s3://bucket/key`` URI into a ``(bucket, key)`` pair.

    Raises:
        ValueError: if there is no ``/`` separating the bucket from the key.
    """
    bucket, sep, key = uri.replace("s3://", "").partition("/")
    if not sep:
        raise ValueError(f"Expected an S3 URI of the form s3://bucket/key, got: {uri!r}")
    return bucket, key


def main():
    """Train one LSTM per (leader, sector, vendor) group, forecast, and export.

    Steps, in order:
      1. Parse the S3 locations from the command line and download the
         train/test Excel workbooks.
      2. For every group with at least 12 monthly values between Jan 2022 and
         Apr 2025, fit a small LSTM on 3-month windows and forecast the next
         three months recursively.
      3. Compare forecasts with the test workbook, collect accuracy / F1
         figures, and write one comparison plot per group.
      4. Upload the plots and the results workbook to the output S3 prefix.
      5. Save the model trained for the LAST processed group as a SavedModel
         under ``<SM_MODEL_DIR>/1`` with a ``serving_default`` signature, then
         reload it and print its signature and variables.

    Raises:
        ValueError: if an S3 path argument is not of the form s3://bucket/key.
        RuntimeError: if no group had enough data, so there is no model to save.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-s3-path', type=str, required=True)
    parser.add_argument('--test-s3-path', type=str, required=True)
    parser.add_argument('--output-s3-path', type=str, required=True)
    # Accepted for CLI compatibility; the value is not read below (the export
    # location comes from the SM_MODEL_DIR environment variable instead).
    parser.add_argument('--model_dir', type=str, default='/opt/ml/model')
    args = parser.parse_args()

    # Download Excel files from S3
    s3 = boto3.client('s3')
    train_bucket, train_key = _split_s3_uri(args.train_s3_path)
    train_excel_data = s3.get_object(Bucket=train_bucket, Key=train_key)['Body'].read()

    test_bucket, test_key = _split_s3_uri(args.test_s3_path)
    test_excel_data = s3.get_object(Bucket=test_bucket, Key=test_key)['Body'].read()

    # Resolve the output location up front so a malformed path fails before training.
    output_bucket, output_key = _split_s3_uri(args.output_s3_path)

    # Load the Excel files
    train_df = pd.read_excel(BytesIO(train_excel_data), engine='openpyxl')
    test_df = pd.read_excel(BytesIO(test_excel_data), engine='openpyxl')

    # Setup
    id_cols = ['ExComm Leader', 'Sector', 'Cloud Vendor']
    train_months = [col for col in train_df.columns if col not in id_cols]
    forecast_months = ['May 2025', 'Jun 2025', 'Jul 2025']
    month_keys = ['may', 'jun', 'jul']

    # The training window (Jan 2022 .. Apr 2025) depends only on the column
    # headers, so the mask is computed once instead of once per group.
    # Headers that do not parse as '%b %Y' become NaT and are excluded.
    train_dates = pd.to_datetime(train_months, format='%b %Y', errors='coerce')
    mask = (train_dates >= pd.to_datetime('2022-01-01')) & (train_dates <= pd.to_datetime('2025-04-30'))

    # Prepare results
    forecast_results = []
    actual_binary_all = []
    predicted_binary_all = []

    # Create output directory for plots
    local_folder = "comparison_plots_lstm"
    os.makedirs(local_folder, exist_ok=True)

    # Model and training windows of the most recently trained group. They stay
    # None when every group is skipped, which is checked before exporting.
    model = None
    X = None

    # Forecast loop
    for keys, group in train_df.groupby(id_cols):
        leader, sector, vendor = keys
        # NOTE(review): assumes exactly one row per group; with several rows the
        # flattened array would be longer than the mask -- TODO confirm.
        train_values = group[train_months].values.flatten()
        filtered_values = train_values[mask]

        if len(filtered_values) < 12:
            continue

        # Normalize data
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(filtered_values.reshape(-1, 1))

        # Prepare sequences: 3 consecutive months predict the following month
        windows, targets = [], []
        for i in range(3, len(scaled_data)):
            windows.append(scaled_data[i-3:i])
            targets.append(scaled_data[i])
        X, y = np.array(windows), np.array(targets)

        # Build LSTM model
        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(X, y, epochs=100, verbose=0)

        # Forecast next 3 months, feeding each prediction back as input
        predictions = []
        current_input = scaled_data[-3:].copy()
        for _ in range(3):
            pred = model.predict(current_input.reshape(1, 3, 1), verbose=0)
            predictions.append(pred[0][0])
            current_input = np.append(current_input[1:], pred, axis=0)

        # Inverse transform predictions
        y_pred = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

        result = {'ExComm Leader': leader, 'Sector': sector, 'Cloud Vendor': vendor}

        # Get actual values from test data
        actual_row = test_df[
            (test_df['ExComm Leader'] == leader) &
            (test_df['Sector'] == sector) &
            (test_df['Cloud Vendor'] == vendor)
        ]

        accuracy_values = []
        total = 0
        actual_values = []
        predicted_values = []
        # Months that actually have both an actual and a prediction; used as
        # the x-axis so it always has the same length as the plotted series.
        plotted_months = []

        for key, month, pred_val in zip(month_keys, forecast_months, y_pred):
            actual_val = actual_row.iloc[0][month] if not actual_row.empty and month in actual_row.columns else None

            result[f"actual_{key}"] = actual_val
            result[f"predicted_{key}"] = pred_val

            if pd.notna(actual_val) and pd.notna(pred_val):
                # Relative accuracy is undefined for a zero actual; skip it so
                # inf/nan does not leak into the per-group mean.
                if actual_val != 0:
                    accuracy = (1 - abs(actual_val - pred_val) / actual_val) * 100
                    result[f"accuracy_{key}"] = round(accuracy, 2)
                    accuracy_values.append(accuracy)
                total += pred_val

                threshold = actual_val * 0.05
                actual_binary_all.append(1 if actual_val > threshold else 0)
                predicted_binary_all.append(1 if pred_val > threshold else 0)

                actual_values.append(actual_val)
                predicted_values.append(pred_val)
                plotted_months.append(month)

        result['Total'] = total
        result['model_accuracy'] = round(np.mean(accuracy_values), 2) if accuracy_values else None
        # Computed over everything accumulated so far, not just this group.
        result['F1_score_overall'] = round(f1_score(actual_binary_all, predicted_binary_all), 2) if actual_binary_all else None
        forecast_results.append(result)

        # Visualization
        if actual_values and predicted_values:
            plt.figure(figsize=(10, 6))
            plt.plot(plotted_months, actual_values, marker='o', label='Actual', color='green')
            plt.plot(plotted_months, predicted_values, marker='o', label='Predicted', color='blue')
            plt.title(f"{leader} | {sector} | {vendor}", fontsize=14)
            plt.xlabel("Month", fontsize=12)
            plt.ylabel("Cost (USD)", fontsize=12)
            plt.legend()
            plt.grid(True)
            plt.xticks(rotation=45)
            # Lay out after rotating the tick labels so they are accounted for.
            plt.tight_layout()
            filename = f"comparison_plots_lstm/{leader}_{sector}_{vendor}_May_Jul_2025_LSTM.png".replace(" ", "_")
            plt.savefig(filename)
            plt.close()

    # Walk through the plot folder and upload each file
    for root, _, files in os.walk(local_folder):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_folder)
            s3_key = os.path.join(output_key, local_folder, relative_path).replace("\\", "/")

            # Upload file
            s3.upload_file(local_path, output_bucket, s3_key)
            print(f"Uploaded {local_path} to s3://{output_bucket}/{s3_key}")

    # Save results
    forecast_df = pd.DataFrame(forecast_results)
    output_buffer = BytesIO()
    forecast_df.to_excel(output_buffer, index=False, engine='openpyxl')
    output_buffer.seek(0)
    s3.upload_fileobj(output_buffer, output_bucket, output_key + "/Predict_group_forecast_LSTM.xlsx")

    # Without at least one trained group there is nothing to export; fail with
    # a clear message instead of a NameError on `model` / `X`.
    if model is None:
        raise RuntimeError(
            "No group had at least 12 monthly values in the training window; "
            "no model was trained, so nothing can be saved."
        )

    # Save the model of the last trained group with an explicit serving signature
    @tf.function(input_signature=[tf.TensorSpec([None, X.shape[1], X.shape[2]], tf.float32)])
    def serving_fn(inputs):
        return {'outputs': model(inputs)}
    model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')
    model_path = os.path.join(model_dir, '1')
    tf.saved_model.save(model, model_path, signatures={'serving_default': serving_fn})

    # Reload from the path that was just written (not a hard-coded one) so the
    # check stays valid when SM_MODEL_DIR points somewhere else.
    # NOTE(review): this only lists the signature and its variables; it never
    # executes the signature, so it cannot detect a "Could not find variable"
    # failure at inference time -- TODO consider calling `infer` on a sample tensor.
    print("checking the model signature keys")
    model_saved = tf.saved_model.load(model_path)
    print(list(model_saved.signatures.keys()))
    infer = model_saved.signatures["serving_default"]
    print(infer.structured_input_signature)
    for var in infer.variables:
        print(var.name)

    # Print overall F1 score
    overall_f1 = f1_score(actual_binary_all, predicted_binary_all) if actual_binary_all else None
    print("Overall F1 Score:", round(overall_f1, 2) if overall_f1 is not None else "N/A")
    print("Forecasting for May to July 2025 using LSTM completed.")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

estimator.py

import sagemaker
import os
import boto3
from sagemaker.tensorflow import TensorFlow

# Launch a SageMaker TensorFlow training job and deploy the result as an endpoint.
# Configuration is read from environment variables:
#   AWS_REGION (optional, defaults to us-east-1), S3_BUCKET, SAGEMAKER_ROLE,
#   AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
print(os.getenv("AWS_REGION"))
region = os.getenv('AWS_REGION', 'us-east-1')
bucket = os.getenv('S3_BUCKET')
role = os.getenv("SAGEMAKER_ROLE")

# Fail fast on missing configuration: an unset bucket would otherwise be
# formatted into the S3 paths below as the literal text "None", and an unset
# role would be handed to the estimator as None.
missing = [name for name, value in (("S3_BUCKET", bucket), ("SAGEMAKER_ROLE", role)) if not value]
if missing:
    raise RuntimeError("Missing required environment variable(s): " + ", ".join(missing))

aws_session = boto3.Session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=region,
)
# Create the SageMaker session on top of the explicit boto3 session
sagemaker_session = sagemaker.Session(boto_session=aws_session)

# Define S3 paths
source_dir_path = f's3://{bucket}/cloud_aiops/cloud_aiops.tar.gz'
train_s3_path = f's3://{bucket}/dataset/AZURE_DATASET_TRAIN.xlsx'
test_s3_path = f's3://{bucket}/dataset/AZURE_DATASET_TEST.xlsx'
output_s3_path = f's3://{bucket}/output'

# Create the Estimator
estimator = TensorFlow(
    entry_point='model_aiops.py',         # Training script to run
    source_dir=source_dir_path,           # S3 URI of the source tarball (presumably contains model_aiops.py -- verify)
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='2.18',
    py_version='py310',
    # Each key is declared as a --<key> command-line option by the training script.
    hyperparameters={
        'train-s3-path': train_s3_path,
        'test-s3-path': test_s3_path,
        'output-s3-path': output_s3_path,
    },
    sagemaker_session=sagemaker_session,
)

# Launch the training job
estimator.fit()

# Deploy the trained model as a SageMaker endpoint
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="tf-endpoint",
)

I have a code block in my train.py that checks the saved model's signature and variables, and it prints the expected output. The error is thrown only when I invoke the endpoint:

    # Reload the exported SavedModel and inspect its serving signature.
    print("checking the model signature keys")
    model_saved = tf.saved_model.load('/opt/ml/model/1')
    print(list(model_saved.signatures.keys()))
    infer = model_saved.signatures["serving_default"]
    print(infer.structured_input_signature)
    # Only lists the variables attached to the signature; the signature itself
    # is never called here, so no inference is actually exercised.
    for var in infer.variables:
        print(var.name)

I have tried saving the model without the /1 directory, but then SageMaker is unable to find the SavedModel while deploying. I cannot change my training model logic, so I need help figuring out the root cause.

1
  • I would recommend first isolating whether this is an AWS/SageMaker-specific issue or a TensorFlow issue, by trying to reproduce the error in a different environment. Commented Oct 24 at 7:39

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.