0

I have a mongoDB collection with 360,000 documents and a text file with 160,000 lines with id and text separated by dash like below format:

 333-nice
 66446-bad
 88-good
 ...

I want that, whenever the id before the dash in the text file matches a prefix of a field in a collection document, a document is updated or created in the new collection.

I used the following query, but it is very slow and takes a long time to complete:

// NOTE(review): this scans every document and, for each one, re-reads and
// re-parses the entire 160k-line file — O(documents × lines) comparisons
// plus one full file read per document. This is the source of the slowness.
db.items_01.find().snapshot().forEach(function(elem)
{
  // File is read once PER DOCUMENT (~360k reads); hoisting these two lines
  // outside the forEach would already be a large win.
  var products = cat("/Users/amirali/Desktop/kala.txt");
  var lines = products.split('\n');
  for(var i = 0;i < lines.length;i++)
  {
    // Text after the last dash, e.g. "nice" from " 333-nice".
    var g_name = lines[i].split("-").pop();
    // Removes only the FIRST double quote, if any (String.replace with a
    // string pattern replaces a single occurrence).
    var pIg = g_name.replace("\"","");
    // Id between the leading space and the first dash, e.g. "333" —
    // relies on each line starting with exactly one space (see sample data).
    var pId = lines[i].substr(1, lines[i].indexOf('-')-1); 

    var field = elem.i_code;

    // Each branch below can only fire when pId happens to have that exact
    // length, because substring(0, n) is compared with == against all of pId.
    // Prefix length 2 -> class fields.
    if(field.substring(0, 2) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "class_id": field.substring(0, 2),
                "class": pIg
            }
        },{upsert:true});
    }

    // Prefix length 3 -> subclass fields.
    if(field.substring(0, 3) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "subclass_id": field.substring(0, 3),
                "subclass": pIg
            }
        },{upsert:true});
    }

    // Prefix length 4 -> group fields.
    if(field.substring(0, 4) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "group_id": field.substring(0, 4),
                "group": pIg
            }
        },{upsert:true});
    }

    // Prefix length 5 -> subgroup fields.
    if(field.substring(0, 5) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "subgroup_id": field.substring(0, 5),
                "subgroup": pIg
            }
        },{upsert:true});
    }

    // Prefix length 6 -> category fields.
    if(field.substring(0, 6) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "category_id": field.substring(0, 6),
                "category": pIg
            }
        },{upsert:true});
    }

    // Prefix length 7 -> subcategory fields.
    if(field.substring(0, 7) == pId)
    {
        db.items_additionals.update({
            "i_code": field
            },
            {
            $set: 
            {
                "i_code" : field,
                "subcategory_id": field.substring(0, 7),
                "subcategory": pIg
            }
        },{upsert:true});
    }

  }
});

Notice: i_code field in documents is like 8816370532410001

How can I change this query to make it run faster?

1 Answer 1

1

The problem is that you do not use the index of your collection. Instead, you linearly go through each document and then for each document, you go linearly through the entire file, so the entire run takes 360000 times 160000 times the cost of a single document-to-line operation.

I suggest you go through the file in the outer loop, then do a lookup of the right documents (which is fast if the id is indexed). This alone should give a speedup of about six orders of magnitude. Since you also would read the file only once, it should be even faster than that (disk operations are expensive).

EDIT: I see now that you do not have a simple indexed lookup. I guess that you'll need to go through both linearly, but you should collect the contents of the file into a lookup table first. What I mean is something like this (untested):

// Read the file ONCE, build a map from id -> words, then make a single pass
// over the collection doing O(1) map lookups per prefix length.

// read it only once
var products = cat("~/foobar/kala.txt");
var lines = products.split('\n');   // FIX: was split('n'), which splits on the letter "n"

// The original query writes a DIFFERENT pair of fields depending on the
// prefix length; preserve that with a length -> [idField, nameField] table.
var fieldNames = {
    2: ["class_id", "class"],
    3: ["subclass_id", "subclass"],
    4: ["group_id", "group"],
    5: ["subgroup_id", "subgroup"],
    6: ["category_id", "category"],
    7: ["subcategory_id", "subcategory"]
};

// collect it into a map of id to words (I assume that there may be multiple
// words for one id; if that is not the case, the arrays are not needed)

var wordmap = {};

lines.forEach( function (line) {
    var parts = line.split('-');
    if (parts.length < 2) return;             // skip blank or malformed lines
    var id = parts[0].trim();                 // FIX: was an implicit global; trim drops the leading space
    var word = parts[1].replace('"', '');     // FIX: was an implicit global

    if (!wordmap[id]) {
        wordmap[id] = [word];
    } else {
        wordmap[id].push(word);
    }
} );

// now go through the items once; at most six constant-time lookups each

db.items_01.find().snapshot().forEach( function (elem) {
    var i_code = elem.i_code;
    for (var length = 2; length < 8; length++) {   // FIX: "length" was an implicit global
        var id = i_code.substring(0, length);
        var words = wordmap[id];
        if (words) {
            var names = fieldNames[length];        // field pair for this prefix length
            words.forEach( function (word) {
                var fields = { "i_code": i_code };
                fields[names[0]] = id;
                fields[names[1]] = word;
                db.items_additionals.update({"i_code": i_code},
                                            {$set: fields},
                                            {upsert: true});
            } );
        }
    }
} );

This should reduce the work from roughly 360000 × 160000 comparisons to a single file read plus one pass over the collection (at most six constant-time map lookups per document), while avoiding 359999 repeated reads of the file.

Sign up to request clarification or add additional context in comments.

1 Comment

Can you give me an example how can i read the file only once and outer loop?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.