From 257dd514ed7c0f6988b8a47219991cc4f61b4529 Mon Sep 17 00:00:00 2001 From: Lucian Buzzo Date: Thu, 11 Nov 2021 13:01:33 +0000 Subject: [PATCH] Improve directory scan speed prior to tarballing This changes improves the speed that the project is tarballed by switching from `klaw` to `recursive-fs` and not running `lstat` on files that are ignored. Whilst testing with the Jellyfish repository, which contains a number of sub directories, each with their own node_modules folder, I was able to reduce the time taken to scan and tarball the project from 70s to 11s, which is a massive improvement. Change-type: patch Signed-off-by: Lucian Buzzo --- lib/utils/device/deploy.ts | 3 + lib/utils/ignore.ts | 127 ++++++++++++++++++++++++++++--------- lib/utils/remote-build.ts | 8 ++- 3 files changed, 106 insertions(+), 32 deletions(-) diff --git a/lib/utils/device/deploy.ts b/lib/utils/device/deploy.ts index 526e501c..53d16732 100644 --- a/lib/utils/device/deploy.ts +++ b/lib/utils/device/deploy.ts @@ -199,14 +199,17 @@ export async function deployToDevice(opts: DeviceDeployOptions): Promise { await checkBuildSecretsRequirements(docker, opts.source); globalLogger.logDebug('Tarring all non-ignored files...'); + const tarStartTime = Date.now(); const tarStream = await tarDirectory(opts.source, { composition: project.composition, convertEol: opts.convertEol, multiDockerignore: opts.multiDockerignore, nogitignore: opts.nogitignore, // v13: delete this line }); + globalLogger.logDebug(`Tarring complete in ${Date.now() - tarStartTime} ms`); // Try to detect the device information + globalLogger.logDebug('Fetching device information...'); const deviceInfo = await api.getDeviceInformation(); let buildLogs: Dictionary | undefined; diff --git a/lib/utils/ignore.ts b/lib/utils/ignore.ts index 69dcbdad..97950602 100644 --- a/lib/utils/ignore.ts +++ b/lib/utils/ignore.ts @@ -200,6 +200,48 @@ export interface FileStats { stats: Stats; } +/** + * Create a list of files for the filesystem subtree rooted at + * projectDir, excluding entries for directories themselves. + * @param projectDir Source directory (root of subtree to be listed) + */ +async function listFiles(projectDir: string): Promise { + const dirs: string[] = []; + const files: string[] = []; + dirs.push(projectDir); + async function walk(currentDirs: string[]): Promise { + if (!currentDirs.length) { + return files; + } + + const foundDirs: string[] = []; + + // Because `currentDirs` can be of arbitrary length, process them in smaller batches + // to avoid out of memory errors. + // This approach is significantly faster than using Bluebird.map with a + // concurrency setting + const chunks = _.chunk(currentDirs, 100); + for (const chunk of chunks) { + await Promise.all( + chunk.map(async (dir) => { + const _files = await fs.readdir(dir, { withFileTypes: true }); + for (const entry of _files) { + const fpath = path.join(dir, entry.name); + if (entry.isDirectory()) { + foundDirs.push(fpath); + dirs.push(fpath); + } else { + files.push(fpath); + } + } + }), + ); + } + return walk(foundDirs); + } + return walk([projectDir]); +} + /** * Return the contents of a .dockerignore file at projectDir, as a string. * Return an empty string if a .dockerignore file does not exist. @@ -211,7 +253,7 @@ async function readDockerIgnoreFile(projectDir: string): Promise { let dockerIgnoreStr = ''; try { dockerIgnoreStr = await fs.readFile(dockerIgnorePath, 'utf8'); - } catch (err) { + } catch (err: any) { if (err.code !== 'ENOENT') { throw new ExpectedError( `Error reading file "${dockerIgnorePath}": ${err.message}`, @@ -269,7 +311,10 @@ export async function filterFilesWithDockerignore( projectDir: string, multiDockerignore: boolean, serviceDirsByService: ServiceDirs, -): Promise<{ filteredFileList: FileStats[]; dockerignoreFiles: FileStats[] }> { +): Promise<{ + filteredFileList: FileStats[]; + dockerignoreFiles: FileStats[]; +}> { // path.resolve() also converts forward slashes to backslashes on Windows projectDir = path.resolve(projectDir); const root = '.' + path.sep; @@ -294,45 +339,65 @@ export async function filterFilesWithDockerignore( const dockerignoreServiceDirs: string[] = multiDockerignore ? Object.keys(ignoreByDir).filter((dir) => dir && dir !== root) : []; + const files = await listFiles(projectDir); + const dockerignoreFiles: FileStats[] = []; const filteredFileList: FileStats[] = []; - const klaw = await import('klaw'); - await new Promise((resolve, reject) => { - // Looking at klaw's source code, `preserveSymlinks` appears to only - // afect the `stats` argument to the `data` event handler - klaw(projectDir, { preserveSymlinks: false }) - .on('error', reject) - .on('end', resolve) - .on('data', (item: { path: string; stats: Stats }) => { - const { path: filePath, stats } = item; - // With `preserveSymlinks: false`, filePath cannot be a symlink. - // filePath may be a directory or a regular or special file - if (!stats.isFile()) { - return; - } + // Because `files` can be of arbitrary length, process them in smaller batches + // to avoid out of memory errors. + // This approach is significantly faster than using Bluebird.map with a + // concurrency setting + const chunks = _.chunk(files, 750); + for (const chunk of chunks) { + await Promise.all( + chunk.map(async (filePath) => { const relPath = path.relative(projectDir, filePath); - const fileInfo = { - filePath, - relPath, - stats, - }; + + // .dockerignore files are always added to a list of known dockerignore files if (path.basename(relPath) === '.dockerignore') { - dockerignoreFiles.push(fileInfo); + const diStats = await fs.stat(filePath); + dockerignoreFiles.push({ + filePath, + relPath, + stats: diStats, + }); } - for (const dir of dockerignoreServiceDirs) { - if (relPath.startsWith(dir)) { - if (!ignoreByDir[dir].ignores(relPath.substring(dir.length))) { - filteredFileList.push(fileInfo); - } + + // First check if the file is ignored by a .dockerignore file in a service directory + const matchingDir = dockerignoreServiceDirs.find((dir) => { + return relPath.startsWith(dir); + }); + + // If the file is ignore in a service directory, exit early, otherwise check if it is ignored by the root .dockerignore file. + // Crucially, if the file is in a known service directory, and isn't ignored, the root .dockerignore file should not be checked. + if (matchingDir) { + if ( + ignoreByDir[matchingDir].ignores( + relPath.substring(matchingDir.length), + ) + ) { return; } + } else if (ignoreByDir[root].ignores(relPath)) { + return; } - if (!ignoreByDir[root].ignores(relPath)) { - filteredFileList.push(fileInfo); + + // At this point we can do a final stat of the file, and check if it should be included + const stats = await fs.stat(filePath); + + // filePath may be a special file that we should ignore, such as a socket + if (stats.isFile()) { + filteredFileList.push({ + filePath, + relPath, + stats, + }); } - }); - }); + }), + ); + } + return { filteredFileList, dockerignoreFiles }; } diff --git a/lib/utils/remote-build.ts b/lib/utils/remote-build.ts index 3d699d52..b2ab9bba 100644 --- a/lib/utils/remote-build.ts +++ b/lib/utils/remote-build.ts @@ -317,12 +317,18 @@ async function getTarStream(build: RemoteBuild): Promise { Object.keys(build.opts.registrySecrets).length > 0 ? preFinalizeCallback : undefined; - return await tarDirectory(path.resolve(build.source), { + globalLogger.logDebug('Tarring all non-ignored files...'); + const tarStartTime = Date.now(); + const tarStream = await tarDirectory(path.resolve(build.source), { preFinalizeCallback: preFinalizeCb, convertEol: build.opts.convertEol, multiDockerignore: build.opts.multiDockerignore, nogitignore: build.nogitignore, // v13: delete this line }); + globalLogger.logDebug( + `Tarring complete in ${Date.now() - tarStartTime} ms`, + ); + return tarStream; } finally { tarSpinner.stop(); }