Okay, I know this might be a weird one.
I have this block of code in GAS:
function setHeroData(ss_id, column, row, valueInputOption) {
try {
var sheet = SpreadsheetApp.getActiveSheet();
let range = sheet.getName() + "!" + column + row;
sheet.setColumnWidth(1, 50);
let domain = "https://gamepress.gg";
let response = UrlFetchApp.fetch(`${domain}/feheroes/heroes`);
let page = Cheerio.load(response.getContentText());
const heroes = new Array(page(".icon\-cell").length+1);
const profiles = new Array(page(".icon\-cell").length);
const vas = new Array(page(".icon\-cell").length);
const illus = new Array(page(".icon\-cell").length);
let profile;
let va;
let ill;
heroes[0] = [];
heroes[0].push("Icon");
heroes[0].push("Hero");
heroes[0].push("Hero Epithet");
heroes[0].push("");
heroes[0].push("Weapon");
heroes[0].push("")
heroes[0].push("Movement");
heroes[0].push("Rarity");
heroes[0].push("Origin");
heroes[0].push("VA (EN)");
heroes[0].push("VA (JP)");
heroes[0].push("Illustrator");
page(".icon\-cell").each(function (i, elem) {
let img = domain + page(elem).find(".hero\-icon").children("img").attr("src");
heroes[i+1] = [];
heroes[i+1].push(`=image("${img}", 3)`);
heroes[i+1].push(page(elem).find(".adventurer\-title").text());
profile = domain + page(this).children("a").attr("href");
profiles[i] = profile;
va = page(elem).parent().attr("data-va");
vas[i] = va;
ill = page(elem).parent().attr("data-ill");
illus[i] = ill;
});
let prof_pages = UrlFetchApp.fetchAll(profiles);
// Get epithets from profile pages
for (let i = 0; i<heroes.length-1; ++i) {
let prof_page = Cheerio.load(prof_pages[i].getContentText());
let attrib = prof_page(".vocabulary-attribute").find(".field--name-name").text();
let attrib_img = domain + prof_page(".vocabulary-attribute").find("img").attr("src");
let move_type = prof_page(".vocabulary-movement").find(".field--name-name").text();
let move_type_img = domain + prof_page(".vocabulary-movement").find("img").attr("src");
let stars = prof_page(".vocabulary-obtainable-stars").find(".field--name-name").text()[0];
let origin = prof_page(".field--name-field-origin").text().trim();
// Populate hero data
heroes[i+1].push(prof_page(".field--name-title").siblings("span").text().replace(" - ", ""));
heroes[i+1].push(`=image("${attrib_img}")`);
heroes[i+1].push(attrib);
heroes[i+1].push(`=image("${move_type_img}")`);
heroes[i+1].push(move_type);
heroes[i+1].push(`=image("https://gamepress.gg/sites/fireemblem/files/2017-06/stars${stars}.png", 3)`)
heroes[i+1].push(origin);
// https://stackoverflow.com/questions/36342430/get-substring-before-and-after-second-space-in-a-string-via-javascript
// Separate the EN and JP voice actors names
let index = vas[i].includes(".") ? vas[i].indexOf(' ', vas[i].indexOf('.') + 2) : vas[i].indexOf(' ', vas[i].indexOf(' ') + 1);
let en_va = index >= 0 ? vas[i].substr(0, index) : vas[i].substr(index + 1);
let jp_va = index >= 0 ? vas[i].substr(index + 1) : "";
if (en_va.toLowerCase() === "Cassandra Lee".toLowerCase()) {
en_va = en_va.concat(" Morris");
jp_va = jp_va.replace("Morris", "");
// Logger.log(en_va);
// Logger.log(jp_va);
}
heroes[i+1].push(en_va.trim());
heroes[i+1].push(jp_va.trim());
heroes[i+1].push(illus[i]);
Logger.log((i*100)/(heroes.length-1));
}
let first_col = column.charCodeAt(0) - 64;
Sheets.Spreadsheets.Values.update({values: heroes}, ss_id, range, valueInputOption);
sheet.autoResizeColumns(first_col, heroes[0].length).autoResizeRows(row, heroes.length);
sheet.setRowHeights(row+1, (heroes.length-1)+row, 50);
sheet.setColumnWidth(first_col, 50);
sheet.setColumnWidth(first_col + 3, 30);
sheet.setColumnWidth(first_col + 5, 30);
sheet.setColumnWidth(first_col + 7, 100);
sheet.setColumnWidth(first_col + 8, 319);
}
catch (err) {
Logger.log(err);
}
}
Now, this code works fine for now, which means it populates a sheet with data scraped from a website. The problem is that there's a group of strings (VA names) that are not entirely consistent in how they're formatted. There's supposed to be two VA's, an English and a Japanese VA, and both are stored in the same html attribute (meaning they're both in the same string), but as names tend to be, there's a lot of variation. Some names are shorter, some longer; some have more words, some less and sometimes it's hard to tell when one name ends and another starts. So far, I've only managed to solve this issue for names with dots (ex: "Joe J. Thomas"), and for names like "Cassandra Lee Morris" I need to specify and edge case, which (from my understanding) is less than ideal. I have also tried to scrape a tag within the webpage that contains all names so that I can maybe validate the names, or take the names directly from that list, but I haven't had any luck.
EDIT: For example, these are strings with different sets of names:
a) "Julie Kliewer Uchida Maaya (内田真礼)"
b) "Joe J. Thomas Suzuki Tatsuhisa (鈴木達央)"
c) "Cassandra Lee Morris Fujiwara Natsumi (藤原夏海)"
I want to be able to extract the english names ("Julie Kliewer", "Joe J. Thomas", "Cassandra Lee Morris") and the japanese names ("Uchida Maaya (内田真礼)", "Suzuki Tatsuhisa (鈴木達央)", "Fujiwara Natsumi (藤原夏海)"), and store each set of names separate from each other.
F. Lee Morris(initial is in first name position),Thomas Tatsuhisa(EN given name, JP surname), etc.