Mongoimport a JSON file whose single document contains an array of documents


I have a lot of JSON files that appear to be collection exports: each one is a single document containing an array of all the documents in the collection.

I am having a problem importing them into MongoDB, as the way I am importing them treats each file as one document and exceeds the 16 MB document limit (some of the files are 140 MB).

The structure is:

{
    "CollectionName": [
        {
            ...
        },
        ...
        {
           ...
        }
    ]
}

The subdocuments in the array have a unique ID in an attribute called "id", which I'm assuming was the original document ID before the export.

I am using PowerShell to execute mongoimport and import the collections. The code I currently have is:

$collection = [regex]::Replace($_.FullName, "(?:C\:\\macship-inbound\\\d{4}\.\d{2}\.\d{2}\.\d{2}\.\d{2})", "")
# Build the argument list and splat it into mongoimport
$params = '--db', 'machship',
          '--collection', "$collection",
          '--type', 'json',
          '--file', $_.FullName,
          '--batchSize', '100',
          '--numInsertionWorkers', '500'
& "C:\MongoDBTools\bin\mongoimport.exe" @params

I have tried adding --jsonArray to the parameters, but that doesn't work either (the top level of each file is an object, not an array).

I would like to import the JSON using "CollectionName" as the collection name in the database, with each subdocument in the array becoming its own document in the collection.

Is this possible? I'm happy to use a different approach or technology; I just used PowerShell because it is easy to add to Task Scheduler on the heavily locked-down machine I am using.

There are 2 answers below.


Here is a heavily massaged ChatGPT solution (using Node.js) I eventually arrived at after many iterations of dealing with large imports hanging, slow import speed, and scripts stalling indefinitely for no apparent reason.

Hopefully this helps someone.

const fs = require('fs');
const { MongoClient } = require('mongodb');
const readline = require('readline');

// MongoDB connection URI
const mongoURI = 'mongodb://localhost:27017/dbname';

// Function to unpack and insert documents in bulk
async function unpackAndInsertBulk(jsonDocument, collectionName) {
  const arrayOfDocuments = jsonDocument[collectionName];

  // useNewUrlParser and useUnifiedTopology are no-ops in driver v4+, so they are omitted
  const client = new MongoClient(mongoURI);

  try {
    await client.connect();
    console.log(`Connected to MongoDB. ${new Date().toISOString()}`);

    const db = client.db();
    const collection = db.collection(collectionName);

    const bulkOps = arrayOfDocuments.map(document => ({
      updateOne: {
        filter: { _id: document.Id }, // Adjust this based on your object's identifier
        update: { $set: document },
        upsert: true,
      },
    }));

    const result = await collection.bulkWrite(bulkOps, { ordered: false });

    console.log(`Bulk write result: ${JSON.stringify(result, null, 2)}`);

    // The driver reports upserts via upsertedCount on the bulk write result
    if (result.upsertedCount > 0) {
      console.log(`Upserted ${result.upsertedCount} documents into collection '${collectionName}'.`);
    } else {
      console.log(`No documents upserted into collection: ${collectionName}`);
    }
  } finally {
    await client.close();
    console.log('Connection to MongoDB closed.');
  }
}

// Read the JSON file using a readable stream, then parse it once fully read
async function readAndProcessFile(filePath) {
  console.log(`Reading and processing file: ${filePath}`);

  const readStream = fs.createReadStream(filePath, { encoding: 'utf8' });
  const rl = readline.createInterface({ input: readStream, crlfDelay: Infinity });

  // Each file is one big JSON object, so it can only be parsed after it has
  // been read in full; parsing partial chunks would throw a SyntaxError. The
  // parsed document must therefore fit in memory, but the read stays incremental.
  let jsonString = '';

  for await (const line of rl) {
    jsonString += line;
  }

  const jsonDocument = JSON.parse(jsonString);
  const collectionName = Object.keys(jsonDocument)[0];
  await unpackAndInsertBulk(jsonDocument, collectionName);

  console.log('File read complete.');

  fs.unlink(filePath, (err) => {
    if (err) {
      console.error(`Error deleting file: ${err.message}`);
    } else {
      console.log(`File deleted: ${filePath}`);
    }
  });
}

// Function to process multiple files concurrently
async function processFiles(filePaths) {
  const promises = filePaths.map(filePath => readAndProcessFile(filePath));
  await Promise.all(promises);
}

// Get the filenames from the command-line arguments
const filenames = process.argv.slice(2);

if (filenames.length === 0) {
  console.error('Please provide at least one filename as a command-line argument.');
  process.exit(1);
}

// Call the function to process files concurrently
processFiles(filenames).catch(error => {
  console.error(`Error during processing: ${error.message}`);
});

I run it with a PowerShell script that gets all the child items in the folder and passes them to the script individually.
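
For reference, a minimal sketch of such a driver script; the folder path and script name are placeholders, and node is assumed to be on the PATH:

# Run the Node importer once per export file in the inbound folder.
# Folder, filter, and script path are assumptions - adjust to your layout.
Get-ChildItem -Path "C:\macship-inbound" -File |
    ForEach-Object { & node "import.js" $_.FullName }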


Your solution with Node.js certainly works; however, the performance might be rather poor. With jq it would be a one-liner and should be much faster. Streaming the array elements out one document per line also means mongoimport does not need --jsonArray:

jq -c ".CollectionName[]" "C:\macship-inbound2023-12-08-12-00" | mongoimport.exe --db=machship --collection=CollectionName --numInsertionWorkers=5

batchSize is an undocumented parameter; I think the default of 1000 is fine. A numInsertionWorkers of 500 seems exaggerated.

Update for more dynamic cases

You can get the first key name with one of these:

jq -r "keys | first" /path/to/json.json
jq -r ". =keys | first" /path/to/json.json

In cmd it is a bit difficult to get the output of a command into a variable (see "Assign output of a program to a variable using a MS batch file"), so I would recommend PowerShell:

$coll = & ./jq.exe -r "keys | first" "C:\macship-inbound2023-12-08-12-00"
./jq.exe -c "first(.[]) | .[]" "C:\macship-inbound2023-12-08-12-00" | mongoimport.exe --collection=$coll ...