CPU Memory Leak While Running Model Inference in an Infinite Loop

Question

I'm experiencing a CPU memory leak while running a Python script that processes text using various NLP models in an infinite loop. The script includes language translation, sentiment analysis, and topic classification. Here's a simplified version of the problematic code:

import ctranslate2 
import torch
import torch.nn.functional as F
from transformers import  AutoTokenizer, AutoModelForSequenceClassification , DistilBertForSequenceClassification, DistilBertTokenizer
import spacy
# Ask spaCy to use the GPU; this call comes before the pipeline is loaded below.
spacy.require_gpu()
# Load the NER pipeline with the listed components disabled (ner_pred only reads doc.ents).
ner_model = spacy.load('ner_model_path', disable=["tagger", "parser", "attribute_ruler", "lemmatizer"]) 

# Maps an input language code to the source-language name passed to the tokenizer in convertToEng.
OTHER_LANG_DICT = {'hi': 'nllb_hi'}
# CTranslate2 translator created on the GPU once at import time and reused by convertToEng.
LANGUAGE_MODEL = ctranslate2.Translator('nllb-200-3.3B-int8',  device="cuda")

# Tokenizers already loaded by convertToEng, keyed by source-language name, so
# the expensive load happens once per language instead of once per call.
_TOKENIZER_CACHE = {}


def _get_tokenizer(src_lang):
    """Return the tokenizer for *src_lang*, loading it only on first use."""
    tokenizer = _TOKENIZER_CACHE.get(src_lang)
    if tokenizer is None:
        # NOTE(review): LANGUAGE_MODEL is a ctranslate2.Translator, while
        # from_pretrained normally takes a model name or path -- confirm the
        # intended tokenizer location.
        tokenizer = AutoTokenizer.from_pretrained(LANGUAGE_MODEL, src_lang=src_lang)
        _TOKENIZER_CACHE[src_lang] = tokenizer
    return tokenizer


def convertToEng(input_data):
    """Return the English text of the last document in *input_data*.

    Args:
        input_data: Mapping of document key to a dict holding 'text' and
            'lang'. Only the last entry is used, which is what the original
            loop effectively did.

    Returns:
        The text unchanged when 'lang' is 'en', the translated text when
        'lang' is a key of OTHER_LANG_DICT, or None when the input is empty,
        the language is unsupported, or any step raises (the reason is
        printed).
    """
    try:
        if not input_data:
            print("convertToEng: no input data")
            return None

        record = list(input_data.values())[-1]
        text = record['text']
        lang = record['lang']

        if lang == 'en':
            return text
        if not lang or lang not in OTHER_LANG_DICT:
            print(f"convertToEng: unsupported language {lang!r}")
            return None

        tokenizer = _get_tokenizer(OTHER_LANG_DICT[lang])
        # Plain id list: no tensor is needed just to map ids back to tokens.
        tokens_list = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
        results = LANGUAGE_MODEL.translate_batch([tokens_list], target_prefix=[["eng_Latn"]])
        # Drop the first hypothesis token, which is the target-language prefix.
        target = results[0].hypotheses[0][1:]
        return tokenizer.decode(tokenizer.convert_tokens_to_ids(target), skip_special_tokens=True)
    except Exception as e:
        print(str(e))
        return None

# Tokenizer and sequence-classification model used by model2_prediction and model2_labelling.
MC_TOKENIZER = AutoTokenizer.from_pretrained('bert_model_path')
MC_MODEL = AutoModelForSequenceClassification.from_pretrained('bert_model_path').to("cuda")
# Put the classifier in evaluation mode.
MC_MODEL.eval()
# Minimum softmax probability a class needs in order to be kept by model2_prediction.
THRESHOLD = 0.3

# DistilBERT classifier (moved to the GPU) and its tokenizer, both used by Model3.
MODEL_3_MODEL = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased_model_path').to(torch.device("cuda"))
MODEL_3_TOKENIZER = DistilBertTokenizer.from_pretrained('distilbert-base-uncased_model_path')


def model2_prediction(input_text):
    """Return, for each input row, the class indices whose probability meets THRESHOLD.

    Args:
        input_text: Text handed to MC_TOKENIZER (model2_labelling passes a
            single string).

    Returns:
        A list containing one list of class indices per input row, or None
        if tokenization or inference raised (the error is printed).
    """
    try:
        # truncation=True keeps over-long inputs within the model's maximum
        # length instead of failing inside the forward pass; Model3 tokenizes
        # the same way.
        tokens = MC_TOKENIZER(
            input_text,
            add_special_tokens=True,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        tokens = {key: value.to('cuda') for key, value in tokens.items()}
        with torch.no_grad():
            logits = MC_MODEL(**tokens)[0]
            # One transfer to plain Python floats; thresholding then runs on
            # the host instead of comparing GPU tensor elements one by one.
            probabilities = F.softmax(logits, dim=1).tolist()
        return [
            [index for index, probability in enumerate(row) if probability >= THRESHOLD]
            for row in probabilities
        ]
    except Exception as e:
        print(str(e))
        return None

def model2_labelling(input_data: str):
    """Return the label names of the classes model2_prediction selects for *input_data*.

    Only the first row of the prediction is used. On any error the message
    is printed and None is returned implicitly.
    """
    try:
        predicted = model2_prediction(input_data)
        # Translate each selected class index into its configured label name.
        return [MC_MODEL.config.id2label[index] for index in predicted[0]]
    except Exception as err:
        print(str(err))


def Model3(input_data: dict):
    """Score the sentiment of input_data["text"] with MODEL_3_MODEL.

    Args:
        input_data: Dict with an "id" and a "text" entry; the text is
            stringified and lower-cased before tokenization.

    Returns:
        {"id": <id>, "sentiment": <list of softmax scores>} with the score
        originally at index 2 moved to index 1. On any error the message is
        printed and None is returned implicitly.
    """
    try:
        doc_id = input_data["id"]
        lowered = str(input_data["text"]).lower()
        encoded = MODEL_3_TOKENIZER(lowered, padding=True, truncation=True, return_tensors="pt")
        device = torch.device("cuda")
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            logits = MODEL_3_MODEL(**encoded).logits
            scores = F.softmax(logits, dim=1).tolist()[0]
        # Reorder the scores: the value at index 2 moves to index 1.
        moved = scores.pop(2)
        scores.insert(1, moved)
        return {"id": doc_id, "sentiment": scores}
    except Exception as err:
        print(str(err))

def text_preprocessing(text):
    """Placeholder for text clean-up; does nothing with *text* and returns None."""
    return None

def ner_pred(text):
    """Return the text of every entity labelled "PERSON" that ner_model finds.

    The input is first passed through text_preprocessing, and the result of
    that call is what ner_model receives.
    """
    cleaned = text_preprocessing(text)
    parsed = ner_model(cleaned)
    return [span.text for span in parsed.ents if span.label_ == "PERSON"]

def ner_result(text):
    """Placeholder for post-processing of the NER result; returns None."""
    return None

def get_result(queue_name):
    """Translate one queued document to English, then score its sentiment and topics.

    Args:
        queue_name: Mapping of document id to a dict with 'text' and 'lang'
            entries, in the shape convertToEng consumes.

    Returns:
        [doc_id, {'sentiment': ..., 'topic': ...}], or None when there is
        nothing to analyse or any step fails (errors are printed).
    """
    try:
        if not queue_name:
            return None

        # convertToEng returns the English text itself (a str) or None; it is
        # not a dict, so it must not be indexed with 'text' or 'id'.
        translated_text = convertToEng(input_data=queue_name)
        if not translated_text:
            return None

        # convertToEng works on the last entry of the mapping, so that
        # entry's key identifies the document.
        doc_id = list(queue_name)[-1]

        # Sentiment is scored on the first 20 whitespace-separated words only.
        sentiment = Model3(input_data={"id": doc_id, "text": ' '.join(translated_text.split()[:20])})
        if sentiment is None:
            # Model3 has already printed the error that made it fail.
            return None
        topics = model2_labelling(input_data=translated_text)
        return [doc_id, {'sentiment': sentiment['sentiment'], 'topic': topics}]
    except Exception as e:
        print(e)
        return None


# Sample input: a single entry keyed "doc1" holding the text and its language code.
data = {"doc1":{'text': 'ram is a good boy.', 'lang': 'en'}}

if __name__ == "__main__":
    # Reproduction loop: runs the pipeline on the same sample forever; the result is never used.
    while True:
        status = get_result(queue_name=data)

I am using the following libraries with Python 3.10, and I have an NVIDIA GeForce RTX 3070 (Driver Version: 535.183.01, CUDA Version: 12.2):

nvidia-cublas-cu12 12.1.3.1
nvidia-cuda-cupti-cu12 12.1.105
nvidia-cuda-nvrtc-cu12 12.1.105
nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
nvidia-cufft-cu12 11.0.2.54
nvidia-curand-cu12 10.3.2.106
nvidia-cusolver-cu12 11.4.5.107
nvidia-cusparse-cu12 12.1.0.106
nvidia-nccl-cu12 2.18.1
nvidia-nvjitlink-cu12 12.5.40
nvidia-nvtx-cu12 12.1.105
torch 2.1.0
spacy 3.7.4
spacy-alignments 0.9.1
spacy-curated-transformers 0.2.2
spacy-legacy 3.0.12
spacy-loggers 1.0.5
spacy-transformers 1.3.5
accelerate 0.29.3
transformers 4.36.2

I have tried clearing the cache using gc, and I have also set the environment variables os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,garbage_collection_threshold:0.8' and os.environ['ONEDNN_PRIMITIVE_CACHE_CAPACITY'] = '0'.

I have also deleted all the variables used in the functions once they return their result from the try block (I delete them in the finally block), but there is no improvement.

hkchengrex · Accepted Answer · 2024-07-26 11:52:41Z

0

Use inference_mode():

if __name__ == "__main__":
    while True:
        # Run each pipeline call inside the torch.inference_mode() context.
        with torch.inference_mode():
            status = get_result(queue_name=data)

answered Jul 26, 2024 at 11:52

hkchengrex

4,86627 silver badges36 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Amritesh Over a year ago

Memory is still leaking. I implemented a new approach where, after a fixed number of iterations, I delete the model object and load it again. This improved things, but I am still facing the same issue. On further investigation, I found that the spaCy model is not unloaded from the GPU after the model object is deleted. What is wrong with spaCy? I also found that spaCy is leaking more memory. Any idea why spaCy does this?

Collectives™ on Stack Overflow

CPU Memory Leak While Running Model Inference in an Infinite Loop

1 Answer 1

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related