GASでWebスクレイピングしてみる
cheerio スクリプトID: 1ReeQ6WO8kKNxoaA_O0XEQ589cIrRvEBA9qcWpNqdOP17i47u6N9M5Xh0
/**
 * Scrapes one page per spreadsheet row and records the outcome in the sheet.
 *
 * For every row from row 2 down to the last row, the value in column B is
 * appended to the base site URL and the resulting URL is written to column H.
 * When column I of that row is still empty, the page is fetched through
 * result() and the row is filled in:
 *   - column G: breadcrumb text
 *   - column I: heading text with newlines removed and runs of spaces collapsed
 *   - column J: the number that precedes "件" in the heading
 *               (red background when that number is 0)
 * For a non-200 response the HTTP status code is written to G, I and J.
 * When the response is 200 but no heading text was found, the status code is
 * written to column I only. Rows whose column I already holds a value are not
 * fetched again.
 */
function getResultData() {
  const siteUrl = 'https://www.サイト名.jp/ディレクトリ等';
  const spreadsheet = SpreadsheetApp.openById('スぺレッドシートのID');
  const sheet = spreadsheet.getSheetByName("シート1");
  const lastRow = sheet.getLastRow();

  // getRange() rejects a row count below 1, so stop when only the header row
  // (or nothing at all) is present.
  if (lastRow < 2) {
    return;
  }

  const values = sheet.getRange(2, 2, lastRow - 1).getValues();

  values.forEach(function (row, i) {
    const rowNum = i + 2; // values[0] corresponds to sheet row 2
    const setBreadCrumb = sheet.getRange(rowNum, 7);
    const setUrl = sheet.getRange(rowNum, 8);
    const cell = sheet.getRange(rowNum, 9);
    const setResultCell = sheet.getRange(rowNum, 10);

    const existing = cell.getValue();
    // Each row of getValues() is a one-element array; read the cell explicitly
    // instead of relying on array-to-string coercion.
    const pageUrl = siteUrl + row[0];
    setUrl.setValue(pageUrl);

    // A non-empty column I marks the row as already processed.
    if (existing) {
      return;
    }

    const res = result(pageUrl, rowNum);
    const httpStatus = res.stat;

    if (httpStatus !== 200) {
      cell.setValue(httpStatus);
      setBreadCrumb.setValue(httpStatus);
      setResultCell.setValue(httpStatus);
      return;
    }

    if (!res.title) {
      cell.setValue(httpStatus);
      return;
    }

    const resultTxt = res.title.trim().replace(/\r?\n/g, '').replace(/ +/g, ' ');
    cell.setValue(resultTxt);
    setBreadCrumb.setValue(res.bread);

    // match() returns null when the heading carries no "<digits>件" part;
    // leave the count cell untouched in that case rather than throwing and
    // aborting the remaining rows.
    const countMatch = resultTxt.match(/([0-9]+)件/);
    if (!countMatch) {
      console.log('row:' + rowNum + ' no count found in title');
      return;
    }

    const num = Number(countMatch[1]);
    setResultCell.setValue(num);
    // Highlight pages that report zero hits; clear any earlier highlight otherwise.
    setResultCell.setBackground(num === 0 ? 'red' : null);
  });
}
/**
 * Fetches a page and extracts its heading and breadcrumb text.
 *
 * The request is made with muteHttpExceptions enabled, so HTTP error statuses
 * are reported through the returned status code rather than thrown.
 *
 * @param {string} url - Page to fetch.
 * @param {number} index - Spreadsheet row number; used only in the log line.
 * @returns {{stat: number, title: (string|undefined), bread: (string|undefined)}}
 *   stat is the HTTP status code. title (text of `h1.title`) and bread (text of
 *   `ul.bread-crumb`) are set only when the status is 200.
 */
function result(url, index) {
  const options = {
    'muteHttpExceptions': true
  };

  // Fetch once and take both the status and the body from the same response;
  // issuing two requests doubles the traffic and can pair the HTML of one
  // response with the status code of another.
  const response = UrlFetchApp.fetch(url, options);
  const status = response.getResponseCode();

  const scraped = {};
  if (status === 200) {
    // The body is decoded as EUC-JP before parsing.
    const html = response.getContentText('EUC-JP');
    const $ = Cheerio.load(html);
    scraped.title = $('h1.title').text();
    scraped.bread = $('ul.bread-crumb').text();
  }
  scraped.stat = status;

  console.log('row:' + index + ' status:' + status + ' url:' + url);
  return scraped;
}