WEBスクレイピング

JAVASCRIPT

GASでWebスクレイピングしてみる

Cheerioライブラリ(GASエディタの「ライブラリ」から追加)のスクリプトID: 1ReeQ6WO8kKNxoaA_O0XEQ589cIrRvEBA9qcWpNqdOP17i47u6N9M5Xh0

/**
 * Fills the "シート1" sheet with scraping results, one data row at a time.
 *
 * For every row from row 2 down to the last row, the value in column B is
 * appended to the site base URL and the resulting URL is written to column H.
 * Rows whose column I is still empty are then fetched through result():
 *   - status 200 with a title: the whitespace-normalized title goes to
 *     column I and the breadcrumb text to column G. If the title contains
 *     "<digits>件", that number goes to column J, with a red background when
 *     it is 0 and the background cleared otherwise. A title without such a
 *     number leaves column J untouched.
 *   - status 200 without a title: the status code goes to column I.
 *   - any other status: the status code goes to columns G, I and J.
 *
 * Does nothing when the sheet has no rows below row 1.
 */
function getResultData() {
  const siteUrl = 'https://www.サイト名.jp/ディレクトリ等';
  const spreadsheet = SpreadsheetApp.openById('スぺレッドシートのID');
  const sheet = spreadsheet.getSheetByName("シート1");
  const lastRow = sheet.getLastRow();

  // With no data rows, getRange(2, 2, lastRow - 1) would be asked for zero
  // (or a negative number of) rows and throw.
  if (lastRow < 2) {
    return;
  }

  const values = sheet.getRange(2, 2, lastRow - 1).getValues();
  values.forEach((el, i) => {
    const rowNum = i + 2; // values[0] corresponds to sheet row 2
    const setBreadCrumb = sheet.getRange(rowNum, 7);
    const setUrl = sheet.getRange(rowNum, 8);
    const cell = sheet.getRange(rowNum, 9);
    const setResultCell = sheet.getRange(rowNum, 10);

    // Each entry of getValues() is a one-element row array.
    const pageUrl = siteUrl + el[0];
    setUrl.setValue(pageUrl);

    // A non-empty column I marks the row as already processed.
    if (cell.getValue()) {
      return;
    }

    const res = result(pageUrl, rowNum);
    const httpStatus = res.stat;

    if (httpStatus !== 200) {
      cell.setValue(httpStatus);
      setBreadCrumb.setValue(httpStatus);
      setResultCell.setValue(httpStatus);
      return;
    }

    if (!res.title) {
      cell.setValue(httpStatus);
      return;
    }

    // Collapse the title onto one line with single spaces.
    const resultTxt = res.title.trim().replace(/\r?\n/g, '').replace(/ +/g, ' ');
    cell.setValue(resultTxt);
    setBreadCrumb.setValue(res.bread);

    // match() returns null when the title carries no "<digits>件"; skip the
    // count instead of crashing the whole run on such a page.
    const countMatch = resultTxt.match(/([0-9]+)件/);
    if (countMatch === null) {
      return;
    }

    const num = Number(countMatch[1]);
    setResultCell.setValue(num);
    setResultCell.setBackground(num === 0 ? 'red' : null);
  });
}

/**
 * Fetches a page once and extracts its title and breadcrumb text.
 *
 * The request is sent with muteHttpExceptions so that HTTP error statuses are
 * returned as a response instead of throwing. Only when the status is 200 is
 * the body decoded as EUC-JP and parsed with Cheerio.
 *
 * @param {string} url - Page URL to fetch.
 * @param {number} index - Sheet row number; used only in the log line.
 * @returns {{stat: number, title?: string, bread?: string}} The HTTP status
 *   in `stat`; on status 200 also the text of `h1.title` in `title` and the
 *   text of `ul.bread-crumb` in `bread`.
 */
function result(url, index) {
  const options = {
    'muteHttpExceptions': true
  };

  // Fetch exactly once and read both status and body from the same response;
  // a second fetch doubles the traffic and may not match the first response.
  const response = UrlFetchApp.fetch(url, options);
  const status = response.getResponseCode();

  const scraped = {};
  if (status === 200) {
    const $ = Cheerio.load(response.getContentText('EUC-JP'));
    scraped.title = $('h1.title').text();
    scraped.bread = $('ul.bread-crumb').text();
  }
  scraped.stat = status;

  console.log('row:' + index + ' status:' + status + ' url:' + url);
  return scraped;
}
Copied title and URL