2020-01-29 11:10:59 +00:00
|
|
|
/**
|
|
|
|
* @license
|
|
|
|
* Copyright 2019-2020 Balena Ltd.
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2020-02-11 18:07:25 +00:00
|
|
|
import { fs } from 'mz';
|
|
|
|
import Logger = require('./logger');
|
|
|
|
|
|
|
|
// Shared logger instance, obtained once when this module is loaded and used
// by the functions below to report informational messages and warnings.
const globalLogger = Logger.getLogger();
|
|
|
|
|
2020-01-29 11:10:59 +00:00
|
|
|
/**
 * File size limit, in bytes (10 MB, decimal). Files larger than this are
 * returned untouched: no encoding analysis and no EOL conversion.
 */
const LARGE_FILE_THRESHOLD = 10e6;

/**
 * Encodings for which EOL conversion is attempted.
 * `convertEolInPlace()` works on a byte-by-byte basis, so it is only valid
 * for UTF-8 or single-byte encodings.
 */
const CONVERTIBLE_ENCODINGS = ['ascii', 'utf-8'];

/**
 * Size, in bytes (1 KiB), of the leading window of a file that is inspected
 * when detecting its encoding.
 */
const DETECT_MAX_BYTES = 1 << 10;
|
2020-01-29 11:10:59 +00:00
|
|
|
|
|
|
|
/**
 * Replace every CRLF pair with a single LF, rewriting the given buffer
 * in place (the input buffer's contents are modified).
 *
 * The conversion works byte by byte, so it is valid for UTF-8, ASCII and
 * 8-bit encodings (e.g. 'latin-1', 'iso-8859-1'), but NOT for UTF-16 or
 * UTF-32. A CR that is not immediately followed by a LF is left untouched.
 *
 * @param buf Buffer to convert; its contents are overwritten
 * @returns A view over the same memory as `buf`, truncated to the (possibly
 * shorter) length of the converted data. No data is copied.
 */
export function convertEolInPlace(buf: Buffer): Buffer {
	const CR = 0x0d;
	const LF = 0x0a;

	// `readPos` walks the original data while `writePos` walks the converted
	// data within the same buffer. Since conversion only ever removes bytes,
	// `writePos <= readPos` always holds and unread input is never clobbered.
	let writePos = 0;
	let previousWasCR = false;

	for (let readPos = 0; readPos < buf.length; readPos++) {
		const byte = buf[readPos];
		buf[writePos] = byte;

		if (previousWasCR && byte === LF) {
			// The CR of this CRLF pair was already copied to the previous output
			// slot: replace it with the LF and do not advance the write position.
			buf[writePos - 1] = LF;
		} else {
			writePos++;
		}

		previousWasCR = byte === CR;
	}

	return buf.subarray(0, writePos);
}
|
|
|
|
|
|
|
|
/**
 * Drop-in replacement for promisified fs.readFile(<string>).
 * Reads the whole file and, for supported encodings, either converts EOLs
 * from CRLF to LF or logs warnings about the CRLF line endings found.
 *
 * The file contents are returned unmodified (and no CRLF warning is issued)
 * when the file is larger than LARGE_FILE_THRESHOLD, when its detected
 * encoding is not listed in CONVERTIBLE_ENCODINGS, or when it contains no
 * CRLF sequence.
 *
 * @param filepath Path of the file to read
 * @param convertEol When true, performs conversions, otherwise just warns.
 * @returns The file contents, with CRLF converted to LF where applicable
 */
export async function readFileWithEolConversion(
	filepath: string,
	convertEol: boolean,
): Promise<Buffer> {
	const fileBuffer = await fs.readFile(filepath);

	// Skip processing of very large files. The size is taken from the data
	// that was actually read instead of a separate fs.stat() call: this saves
	// an I/O round-trip and cannot disagree with the buffer contents if the
	// file changes on disk between the two calls.
	if (fileBuffer.length > LARGE_FILE_THRESHOLD) {
		globalLogger.logWarn(`CRLF detection skipped for large file: ${filepath}`);
		return fileBuffer;
	}

	// Analyse encoding
	const encoding = await detectEncoding(fileBuffer);

	// Skip further processing of non-convertible encodings
	// (this includes empty files, for which the detected encoding is '')
	if (!CONVERTIBLE_ENCODINGS.includes(encoding)) {
		return fileBuffer;
	}

	// Skip further processing of files that don't contain CRLF
	if (!fileBuffer.includes('\r\n')) {
		return fileBuffer;
	}

	if (convertEol) {
		// Convert CRLF->LF
		globalLogger.logInfo(
			`Converting line endings CRLF -> LF for file: ${filepath}`,
		);

		return convertEolInPlace(fileBuffer);
	} else {
		// Immediate warning
		globalLogger.logWarn(
			`CRLF (Windows) line endings detected in file: ${filepath}`,
		);
		// And summary warning later
		globalLogger.deferredLog(
			'Windows-format line endings were detected in some files. Consider using the `--convert-eol` option.',
			Logger.Level.WARN,
		);

		return fileBuffer;
	}
}
|
2020-02-11 18:07:25 +00:00
|
|
|
|
|
|
|
/**
 * Attempt to detect the encoding of a data buffer.
 * Code copied and modified from the npm package 'isbinaryfile' (MIT licence)
 * https://github.com/gjtorikian/isBinaryFile/blob/master/src/index.ts
 *
 * Detection first looks for a byte order mark (or the PDF signature) at the
 * start of the data, and otherwise scans up to DETECT_MAX_BYTES bytes: a NULL
 * byte, a disallowed control character or a malformed multi-byte sequence
 * makes the data 'binary'.
 *
 * Multi-byte UTF-8 sequences of 2, 3 and 4 bytes are recognised. A sequence
 * that starts inside the scanned window is validated against the data that
 * follows the window (up to `bytesRead`), so that a character straddling the
 * window boundary does not cause text to be reported as 'binary'.
 *
 * @returns a promise for one of the possible values: '' (empty file),
 * 'utf-8', 'utf-16', 'utf-32', 'gb-18030', 'pdf', and 'binary'.
 *
 * Note: pure ASCII data is identified as 'utf-8' (ASCII is indeed a subset
 * of UTF-8).
 *
 * @param fileBuffer File contents whose encoding should be detected
 * @param bytesRead Optional "file size" if smaller than the buffer size
 */
export async function detectEncoding(
	fileBuffer: Buffer,
	bytesRead = fileBuffer.length,
): Promise<string> {
	// empty file
	if (bytesRead === 0) {
		return '';
	}

	const totalBytes = Math.min(bytesRead, DETECT_MAX_BYTES);

	// Whether the data is long enough for, and starts with, the given bytes
	const startsWith = (...signature: number[]): boolean =>
		bytesRead >= signature.length &&
		signature.every((byte, index) => fileBuffer[index] === byte);

	// Whether the byte at `index` is a UTF-8 continuation byte (0b10xxxxxx).
	// Evaluates to false when `index` is beyond the end of the buffer.
	const isContinuation = (index: number): boolean => {
		const byte = fileBuffer[index];
		return byte > 127 && byte < 192;
	};

	// UTF-8 BOM
	if (startsWith(0xef, 0xbb, 0xbf)) {
		return 'utf-8';
	}

	// UTF-32 BE BOM
	if (startsWith(0x00, 0x00, 0xfe, 0xff)) {
		return 'utf-32';
	}

	// UTF-32 LE BOM (must be tested before UTF-16 LE, which shares its prefix)
	if (startsWith(0xff, 0xfe, 0x00, 0x00)) {
		return 'utf-32';
	}

	// GB BOM (https://en.wikipedia.org/wiki/GB_18030)
	if (startsWith(0x84, 0x31, 0x95, 0x33)) {
		return 'gb-18030';
	}

	if (totalBytes >= 5 && fileBuffer.slice(0, 5).toString() === '%PDF-') {
		/* PDF. This is binary. */
		return 'pdf';
	}

	// UTF-16 BE BOM
	if (startsWith(0xfe, 0xff)) {
		return 'utf-16';
	}

	// UTF-16 LE BOM
	if (startsWith(0xff, 0xfe)) {
		return 'utf-16';
	}

	for (let i = 0; i < totalBytes; i++) {
		const c = fileBuffer[i];
		if (c === 0) {
			// NULL byte
			return 'binary';
		} else if (c === 27) {
			// ESC character used in ANSI escape sequences for text color (log files)
			continue;
		} else if ((c < 7 || c > 14) && (c < 32 || c > 127)) {
			// Neither printable ASCII nor a common control character (BEL..SO):
			// only acceptable as the lead byte of a multi-byte UTF-8 sequence.
			let continuationBytes: number;
			if (c > 193 && c < 224) {
				continuationBytes = 1; // 0xC2..0xDF: 2-byte sequence
			} else if (c > 223 && c < 240) {
				continuationBytes = 2; // 0xE0..0xEF: 3-byte sequence
			} else if (c > 239 && c < 245) {
				continuationBytes = 3; // 0xF0..0xF4: 4-byte sequence
			} else {
				return 'binary';
			}

			// The sequence must fit in the available data. This is deliberately
			// checked against `bytesRead` rather than `totalBytes`: a sequence cut
			// short only by the detection window is still valid text.
			if (i + continuationBytes >= bytesRead) {
				return 'binary';
			}
			for (let k = 1; k <= continuationBytes; k++) {
				if (!isContinuation(i + k)) {
					return 'binary';
				}
			}
			// Skip over the continuation bytes that were just validated
			i += continuationBytes;
		}
	}

	return 'utf-8';
}
|