移除註釋的完善思路：真的能夠用正則實現？

時間 2019-11-05

原文原文鏈接

導語

網上有不少自稱能實現移除JS註釋的正則表達式，實際上存在種種缺陷。這令人多少有些愕然，也不由疑惑到：真的能夠用正則實現嗎？而本篇文章以使用正則移除JS註釋爲目標，經過實踐，由淺及深，遇到問題解決問題，一步步看看到底可否用正則實現！

移除註釋的完善思路：真的能夠用正則實現？

1 單行註釋

單行註釋要麼佔據一整行，要麼處於某一行的最後。
正常狀況下不難，直接經過正則匹配，再用replace方法移除即可。前端

let codes = `
  let name = "Wmaker"; // This is name.
  if (name) {
    // Print name.
    console.log("His name is:", name);
  }
`;

console.log( codes.replace(/\/\/.*$/mg, '') );

// 打印出：
// let name = "Wmaker"; 
// if (name) {
//   
//   console.log("His name is:", name);
// }

上面是成功的刪除了註釋，不過對於獨佔一整行的註釋清理的不夠完全，會留下空白行。實際上，行尾註釋前面的空白也被保留了下來。因此目標稍稍提升，清除這些空白。操做起來也並不難，思路大體這樣：刪除整行，其實是刪除本行末尾的換行符或上一行末尾的換行符。而換行符自己也屬於空白符。因此只需操做正則，匹配到註釋以及註釋前面全部的空白符便可，一舉兩得。node

let codes = `
  let name = "Wmaker"; // This is name.
  if (name) {
    // Print name.
    console.log("His name is:", name);
  }
`;

console.log( codes.replace(/\s*\/\/.*$/mg, '') );

// 打印出：
// let name = "Wmaker";
// if (name) {
//   console.log("His name is:", name);
// }

若是在字符串中出現完整的URL地址，上面的正則會直接匹配而將其刪除。網上大多會將URL的格式特徵（http://xxx）：雙斜槓前面有冒號，做爲解決途徑加以利用。但這只是治標不治本的作法，畢竟//以任何形式出如今字符串中是它的自由，咱們無從干涉。

這樣問題就轉變成：如何使正則匹配存在於引號外的雙斜槓？
想匹配被引號包圍，帶有雙斜槓的代碼塊比較簡單：/".*\/\/.*"/mg。難點在於如何實現這個否定，即當正則匹配到雙斜槓後，再判斷其是否在引號裏面？絞盡腦汁，也上網查了不少，都沒有像樣的結果。靜心平氣，洗把臉刷刷牙再衝個頭冷靜以後，以爲單純使用正則的路已經走不通了，得跳出這個圈。

就在近乎精盡人亡的最後關頭，在那淫穢污濁的房間上方忽然光芒萬丈。我急忙護住了充滿血絲的眼睛，靜待其適應後定睛一看。只見那裏顯現出了一段文字（Chinese）：孩兒啊，先將帶有//被引號包圍的字符串替換掉，去掉註釋後再還原，不就好了嗎？正則表達式

let codes = `
  let name = "Wmaker"; // This is name.
  if (name) {
    // Print name.
    console.log("His name is:", name);
    console.log("Unusual situation, characters of // in quotation marks.");
  }
`;

// 以前的方式。
console.log( codes.replace(/\s*\/\/.*$/mg, '') );
// 打印出：
// let name = "Wmaker";
// if (name) {
//   console.log("His name is:", name);
//   console.log("Unusual situation, characters of
// }

// 如今的方式。
console.log( removeComments(codes) );
// 打印出：
// let name = "Wmaker";
// if (name) {
//   console.log("His name is:", name);
//   console.log("Unusual situation, characters of // in quotation marks.");
// }

/**
 * Strips `//` line comments from JavaScript source text.
 *
 * Double-quoted spans that contain `//` are swapped for placeholders first so
 * the comment pattern cannot match inside them; the saved spans are put back
 * once the comments have been removed.
 *
 * @param {string} codes - Source text to process.
 * @returns {string} The text with `//` comments (and the whitespace directly
 *   before each one) removed.
 */
function removeComments(codes) {
  let { replacedCodes, matchedObj } = replaceQuotationMarksWithForwardSlash(codes);

  replacedCodes = replacedCodes.replace(/\s*\/\/.*$/mg, '');
  for (const [placeholder, original] of Object.entries(matchedObj)) {
    // A replacer function inserts the saved text verbatim. Passing the text as
    // a plain string would expand `$&`, `$$`, `$1` ... found inside it.
    replacedCodes = replacedCodes.replace(placeholder, () => original);
  }

  return replacedCodes;

  /**
   * Replaces every double-quoted span containing `//` with a placeholder.
   *
   * @param {string} codes - Source text to scan.
   * @returns {{replacedCodes: string, matchedObj: Object<string, string>}}
   *   The rewritten text and a placeholder -> original-span lookup.
   */
  function replaceQuotationMarksWithForwardSlash(codes) {
    const matchedObj = {};

    const regQuotation = /".*\/\/.*"/mg;
    // Assumes this prefix does not already occur in `codes`; it is not checked.
    const uniqueStr = 'QUOTATIONMARKS' + Math.floor(Math.random()*10000);

    let index = 0;
    const replacedCodes = codes.replace(regQuotation, function(match) {
      // The trailing `_` keeps placeholders prefix-free: without it `...1` is a
      // prefix of `...10`, and restoring a placeholder that was deleted along
      // with a comment would corrupt the longer one.
      const s = uniqueStr + (index++) + '_';
      matchedObj[s] = match;
      return s;
    });

    return { replacedCodes, matchedObj };
  }
}

是的，目標達成了，老天眷顧啊！
另外，有一個須要優化的地方：定義字符串的方式有三種 ' " ` ，目前咱們只匹配了雙引號。segmentfault

爲了避免正則的記憶功能（帶 g 標誌的正則對象在多次 test 之間會保留 lastIndex），都使用了正則字面量進行測試。

--- 以前
console.log( /".*\/\/.*"/mg.test(`'Unu//sual'`) ); // false
console.log( /".*\/\/.*"/mg.test(`"Unu//sual"`) ); // true
console.log( /".*\/\/.*"/mg.test(`\`Unu//sual\``) ); // false

--- 以後
console.log( /('|"|`).*\/\/.*\1/mg.test(`'Unu//sual'`) ); // true
console.log( /('|"|`).*\/\/.*\1/mg.test(`"Unu//sual"`) ); // true
console.log( /('|"|`).*\/\/.*\1/mg.test(`\`Unu//sual\``) ); // true

啊！問題到此結束了！
真的結束了嗎？不！我看了看時間：02:17，而後將眼鏡摘下，扯了張紙巾，拭去了幾顆淚水。dom

如下是接連解決的兩個問題：貪婪模式和轉義字符。測試

--- STEP 1，因爲正則的貪婪模式致使。
let codes = `
  let str = 'abc//abc'; // abc'
`;
console.log( codes.match(/('|"|`).*\/\/.*\1/mg) ); // ["'abc//abc'; // abc'"]

-- 解決

let codes = `
  let str = 'abc//abc'; // abc'
`;
console.log( codes.match(/('|"|`).*?\/\/.*?\1/mg) ); // ["'abc//abc'"]


--- STEP 2，由定義字符串時其中的轉義字符致使。
let codes = `
  let str = 'http://x\\'x.com'; // 'acs
`;
console.log( codes.match(/('|"|`).*?\/\/.*?\1/mg) ); // ["'http://x\'", "'; // '"]

-- 解決

let reg = /(?<!\\)('|"|`).*?\/\/.*?(?<!\\)\1/mg;
let codes = `
  let str = 'http://x\\'x.com'; // 'acs
`;
console.log( codes.match(reg) ); // ["'http://x\'x.com'"]

事情到這裏，雖然勞累，但多少有些成就感，畢竟成功了。優化

但是，但是，但是在測試時，居然無心間發現一個沒法逾越的障礙。就比如費勁千辛萬苦花費無盡的財力物力以後，某某尤物終於願意一同去情人旅館時，卻發現家家爆滿，沒有空餘的房間。在強裝歡笑，玩命的哄騙着她，一家接連一家的尋找直到終於定到房間後，卻發現本身已然挺不起來了！

正則會將任意位置的引號做爲查找的起始位置，它不在意引號是成雙的道理。下面是一個示例。

let reg = /(?<!\\)('|"|`).*?\/\/.*?(?<!\\)\1/mg;
let codes = `
  let str = "abc"; // "
`;
console.log( codes.match(reg) ); // [""abc"; // ""]

不過，問題好歹在補過覺以後的 06:37 時得以解決。
思路是這樣的：雖然不能正確實現匹配帶有//被引號包圍的代碼塊（可能有方法，但能力有限），可是簡化成匹配單純被引號包圍的代碼塊，是簡單並且能正確作到的，雖然耗費的內存多了一些。另外，兩引號間也可能包含換行符，因此爲其增長s模式：.表明所有字符。下面是去除單行註釋的最終代碼。

let codes = `
  let name = "Wmaker"; // This is name.
  let str = 'http://x\\'x.com' + " / / " + '/"/"/'; // '; // " "
  if (name) {
    // Print name.
    console.log("His name is:", name);
    console.log("Unusual situation, characters of // in quotation marks.");
  }
`;

console.log(removeComments(codes));
// 打印出：
// let name = "Wmaker";
// let str = 'http://x\'x.com' + " / / " + '/"/"/';
// if (name) {
//   console.log("His name is:", name);
//   console.log("Unusual situation, characters of // in quotation marks.");
// }


/**
 * Strips `//` line comments from JavaScript source text.
 *
 * Every span delimited by a matching pair of `'`, `"` or backtick characters
 * is swapped for a placeholder first so the comment pattern cannot match
 * inside it; the saved spans are put back once the comments are removed.
 *
 * @param {string} codes - Source text to process.
 * @returns {string} The text with `//` comments (and the whitespace directly
 *   before each one) removed.
 */
function removeComments(codes) {
  let { replacedCodes, matchedObj } = replaceQuotationMarksWithForwardSlash(codes);

  replacedCodes = replacedCodes.replace(/\s*\/\/.*$/mg, '');
  for (const [placeholder, original] of Object.entries(matchedObj)) {
    // A replacer function inserts the saved text verbatim. Passing the text as
    // a plain string would expand `$&`, `$$`, `$1` ... found inside it.
    replacedCodes = replacedCodes.replace(placeholder, () => original);
  }

  return replacedCodes;

  /**
   * Replaces every quoted span with a placeholder.
   *
   * A quote counts as escaped only when the character directly before it is a
   * backslash. The `s` flag lets a span run across line breaks.
   *
   * @param {string} codes - Source text to scan.
   * @returns {{replacedCodes: string, matchedObj: Object<string, string>}}
   *   The rewritten text and a placeholder -> original-span lookup.
   */
  function replaceQuotationMarksWithForwardSlash(codes) {
    const matchedObj = {};

    const regQuotation = /(?<!\\)('|"|`).*?(?<!\\)\1/smg;
    // Assumes this prefix does not already occur in `codes`; it is not checked.
    const uniqueStr = 'QUOTATIONMARKS' + Math.floor(Math.random()*10000);

    let index = 0;
    const replacedCodes = codes.replace(regQuotation, function(match) {
      // The trailing `_` keeps placeholders prefix-free: without it `...1` is a
      // prefix of `...10`, and restoring a placeholder that was deleted along
      // with a comment would corrupt the longer one.
      const s = uniqueStr + (index++) + '_';
      matchedObj[s] = match;
      return s;
    });

    return { replacedCodes, matchedObj };
  }
}

最後補充一點，單雙引號雖然也能夠多行顯示，但其解析後實際是單行的。

let codes = "' \
  Wmaker \
'";
codes.match( /(?<!\\)('|"|`).*?(?<!\\)\1/smg ); // ["'   Wmaker '"]

2 多行註釋

啊！難點已經解決，如今就能夠悠哉悠哉的往前推動了。
多行註釋與單行思路相同，只需在刪除註釋時多加一個匹配模式。中和二者的最終代碼以下。

let codes = `
  let name = "Wmaker"; // This is name.
  let str = 'http://x\\'x.com' + " / / " + '/"/"/'; // '; // " "
  let str = 'http://x\\'x./*a*/com' + " / / " + '/"/"/'; // '; // "/*sad*/ "
  if (name) {
    // Print name.
    /* Print name. */
    console.log("His name is:", name);
    console.log("Unusual situation, characters of // in quotation marks.");
    /*
     * Others test.
     */
    console.log("Unusual situation, characters of /* abc */ in quotation marks.");
  }
`;

console.log(removeComments(codes));
// 打印出：
// let name = "Wmaker";
// let str = 'http://x\'x.com' + " / / " + '/"/"/';
// let str = 'http://x\'x./*a*/com' + " / / " + '/"/"/';
// if (name) {
//   console.log("His name is:", name);
//   console.log("Unusual situation, characters of // in quotation marks.");
//   console.log("Unusual situation, characters of /* abc */ in quotation marks.");
// }

/**
 * Strips `//` line comments and block comments from JavaScript source text.
 *
 * Every span delimited by a matching pair of `'`, `"` or backtick characters
 * is swapped for a placeholder first so the comment patterns cannot match
 * inside it; the saved spans are put back once the comments are removed.
 *
 * @param {string} codes - Source text to process.
 * @returns {string} The text with comments (and the whitespace directly
 *   before each one) removed.
 */
function removeComments(codes) {
  let { replacedCodes, matchedObj } = replaceQuotationMarksWithForwardSlash(codes);

  // First alternative: `//` to end of line. Second: the shortest block comment.
  replacedCodes = replacedCodes.replace(/(\s*\/\/.*$)|(\s*\/\*[\s\S]*?\*\/)/mg, '');
  for (const [placeholder, original] of Object.entries(matchedObj)) {
    // A replacer function inserts the saved text verbatim. Passing the text as
    // a plain string would expand `$&`, `$$`, `$1` ... found inside it.
    replacedCodes = replacedCodes.replace(placeholder, () => original);
  }

  return replacedCodes;

  /**
   * Replaces every quoted span with a placeholder.
   *
   * A quote counts as escaped only when the character directly before it is a
   * backslash. The `s` flag lets a span run across line breaks.
   *
   * @param {string} codes - Source text to scan.
   * @returns {{replacedCodes: string, matchedObj: Object<string, string>}}
   *   The rewritten text and a placeholder -> original-span lookup.
   */
  function replaceQuotationMarksWithForwardSlash(codes) {
    const matchedObj = {};

    const regQuotation = /(?<!\\)('|"|`).*?(?<!\\)\1/smg;
    // Assumes this prefix does not already occur in `codes`; it is not checked.
    const uniqueStr = 'QUOTATIONMARKS' + Math.floor(Math.random()*10000);

    let index = 0;
    const replacedCodes = codes.replace(regQuotation, function(match) {
      // The trailing `_` keeps placeholders prefix-free: without it `...1` is a
      // prefix of `...10`, and restoring a placeholder that was deleted along
      // with a comment would corrupt the longer one.
      const s = uniqueStr + (index++) + '_';
      matchedObj[s] = match;
      return s;
    });

    return { replacedCodes, matchedObj };
  }
}

3 總結

從以上能夠得出結論，單純使用正則表達式是不能達到目標的，須要配合其它操做才行。但如今得出的結果真的能覆蓋所有的狀況？會不會有其它的隱藏問題，好比多字節字符的問題。雖然做爲一個碼農，該有的自信不會少，但慢慢的也明白了本身的侷限性。從網上的其它資料看，使用UglifyJS，或在正確的解析中去除註釋，會更爲穩妥。但有可能本身動手解決的，沒理由不花費些精力試試！

問題更新記錄
已發現，暫時不能用此思路解決問題。
感謝熱心同志找出的錯誤，我會將能改與不能改的都列於此地，並只會更新下面兩個示例的代碼。

1.沒有考慮正則字面量中的轉義字符。
出錯示例：var reg=/a\//;。
修改方式：將刪除註釋的正則改成：/(\s*(?<!\\)\/\/.*$)|(\s*(?<!\\)\/\*[\s\S]*?(?<!\\)\*\/)/mg。

2.沒法替換正則字面量。
出錯示例：var a=/abc/*123;var b=123*/123/。雖然的確是沒意義的代碼，但一無語法錯誤，二能被引擎解析。
修改方式：無，以以前的思惟暫時沒辦法。
緣由：沒法像簡單替換引號同樣，先行替換正則字面量。

3.沒法正確的移除引號塊。
出錯示例：

let codes = `
  let name = "Wmaker"; // direct\`ive of f' write as f".
  let name = "Wmaker"; // direct\`ive of f' write as f".
  let name = \`
    /* name */
  \`;
`;

修改方式：無，以以前的思惟暫時沒辦法。
緣由：'"的狀況比較好解決，可是`既可單行也可多行。

這裏是工做於前端頁面的代碼及相應示例，下載連接。

<!DOCTYPE html>
<html>

<head>
  <meta charset="UTF-8">
  <title>Remove Comments</title>
</head>

<body>
  <p>輸入：</p>
  <textarea id="input" cols="100" rows="12"></textarea>

  <br /><br />
  <button onclick="transform()">轉換</button>

  <p>輸出：</p>
  <textarea id="output" cols="100" rows="12"></textarea>
  
  <script>
    let input = document.querySelector('#input');
    let output = document.querySelector('#output');

    setDefaultValue();

    // Click handler for the button: passes the text of the #input textarea
    // through removeComments and shows the result in the #output textarea.
    function transform() {
      output.value = removeComments(input.value);
    }

    /**
     * Strips `//` line comments and block comments from JavaScript source text.
     *
     * Every span delimited by a matching pair of `'`, `"` or backtick
     * characters is swapped for a placeholder first so the comment patterns
     * cannot match inside it; the saved spans are put back afterwards.
     *
     * @param {string} codes - Source text to process.
     * @returns {string} The text with comments (and the whitespace directly
     *   before each one) removed.
     */
    function removeComments(codes) {
      let { replacedCodes, matchedObj } = replaceQuotationMarksWithForwardSlash(codes);

      // The lookbehinds skip comment markers whose first character directly
      // follows a backslash (e.g. the regex literal /a\//).
      replacedCodes = replacedCodes.replace(/(\s*(?<!\\)\/\/.*$)|(\s*(?<!\\)\/\*[\s\S]*?(?<!\\)\*\/)/mg, '');
      for (const [placeholder, original] of Object.entries(matchedObj)) {
        // A replacer function inserts the saved text verbatim. Passing the text
        // as a plain string would expand `$&`, `$$`, `$1` ... found inside it.
        replacedCodes = replacedCodes.replace(placeholder, () => original);
      }

      return replacedCodes;

      /**
       * Replaces every quoted span with a placeholder.
       *
       * A quote counts as escaped only when the character directly before it
       * is a backslash. Without the `s` flag a span cannot cross a line break.
       *
       * @param {string} codes - Source text to scan.
       * @returns {{replacedCodes: string, matchedObj: Object<string, string>}}
       *   The rewritten text and a placeholder -> original-span lookup.
       */
      function replaceQuotationMarksWithForwardSlash(codes) {
        const matchedObj = {};

        const regQuotation = /(?<!\\)('|"|`).*?(?<!\\)\1/mg;
        // Assumes this prefix does not already occur in `codes`; not checked.
        const uniqueStr = 'QUOTATIONMARKS' + Math.floor(Math.random()*10000);

        let index = 0;
        const replacedCodes = codes.replace(regQuotation, function(match) {
          // The trailing `_` keeps placeholders prefix-free: without it `...1`
          // is a prefix of `...10`, and restoring a placeholder that was
          // deleted along with a comment would corrupt the longer one.
          const s = uniqueStr + (index++) + '_';
          matchedObj[s] = match;
          return s;
        });

        return { replacedCodes, matchedObj };
      }
    }

    // Pre-fills the #input textarea with a sample program that mixes line
    // comments, block comments and comment-like text inside string literals.
    // The template literal's continuation lines start at column 0 on purpose:
    // any indentation there would become part of the textarea content.
    function setDefaultValue() {
      input.value = `let name = "Wmaker"; // This is name.
let str = 'http://x\\'x.com' + " / / " + '/"/"/'; // '; // " "
let str = 'http://x\\'x./*a*/com' + " / / " + '/"/"/'; // '; // "/*sad*/ "
if (name) {
  // Print name.
  /* Print name. */
  console.log("His name is:", name);
  console.log("Unusual situation, characters of // in quotation marks.");
  /*
   * Others test.
   */
  console.log("Unusual situation, characters of /* abc */ in quotation marks.");
}
`;
    }
  </script>
</body>
</html>

這裏是工做於Node端的代碼及相應示例，下載連接。運行命令：node 執行文件 待轉譯文件 轉譯後文件。

const fs = require('fs');
const path = require('path');
const process = require('process');


// Command-line entry point: node <script> <sourceFile> <targetFile>.
let [sourceFile, targetFile] = process.argv.slice(2);
if (!(sourceFile && targetFile)) {
  throw new Error('Please set source file and target file.');
}

// Both paths are resolved against this script's directory (__dirname).
sourceFile = path.resolve(__dirname, sourceFile);
targetFile = path.resolve(__dirname, targetFile);

// Read the source, strip its comments, then write the result to the target.
fs.readFile(sourceFile, 'utf8', function onRead(readErr, data) {
  if (readErr) throw readErr;
  const stripped = removeComments(data);
  fs.writeFile(targetFile, stripped, 'utf8', function onWritten(writeErr) {
    if (writeErr) throw writeErr;
    console.log('Remove Comments Done!');
  });
});

/**
 * Strips `//` line comments and block comments from JavaScript source text.
 *
 * Every span delimited by a matching pair of `'`, `"` or backtick characters
 * is swapped for a placeholder first so the comment patterns cannot match
 * inside it; the saved spans are put back once the comments are removed.
 *
 * @param {string} codes - Source text to process.
 * @returns {string} The text with comments (and the whitespace directly
 *   before each one) removed.
 */
function removeComments(codes) {
  let { replacedCodes, matchedObj } = replaceQuotationMarksWithForwardSlash(codes);

  // The lookbehinds skip comment markers whose first character directly
  // follows a backslash (e.g. the regex literal /a\//).
  replacedCodes = replacedCodes.replace(/(\s*(?<!\\)\/\/.*$)|(\s*(?<!\\)\/\*[\s\S]*?(?<!\\)\*\/)/mg, '');
  for (const [placeholder, original] of Object.entries(matchedObj)) {
    // A replacer function inserts the saved text verbatim. Passing the text as
    // a plain string would expand `$&`, `$$`, `$1` ... found inside it.
    replacedCodes = replacedCodes.replace(placeholder, () => original);
  }

  return replacedCodes;

  /**
   * Replaces every quoted span with a placeholder.
   *
   * A quote counts as escaped only when the character directly before it is a
   * backslash. Without the `s` flag a span cannot cross a line break.
   *
   * @param {string} codes - Source text to scan.
   * @returns {{replacedCodes: string, matchedObj: Object<string, string>}}
   *   The rewritten text and a placeholder -> original-span lookup.
   */
  function replaceQuotationMarksWithForwardSlash(codes) {
    const matchedObj = {};

    const regQuotation = /(?<!\\)('|"|`).*?(?<!\\)\1/mg;
    // Assumes this prefix does not already occur in `codes`; it is not checked.
    const uniqueStr = 'QUOTATIONMARKS' + Math.floor(Math.random()*10000);

    let index = 0;
    const replacedCodes = codes.replace(regQuotation, function(match) {
      // The trailing `_` keeps placeholders prefix-free: without it `...1` is a
      // prefix of `...10`, and restoring a placeholder that was deleted along
      // with a comment would corrupt the longer one.
      const s = uniqueStr + (index++) + '_';
      matchedObj[s] = match;
      return s;
    });

    return { replacedCodes, matchedObj };
  }
}