0

Okay, I know this might be a weird one.

I have this block of code in GAS:

/**
 * Scrapes the Fire Emblem Heroes hero list from gamepress.gg and writes it to
 * a spreadsheet: one header row followed by one row per hero (icon, name,
 * epithet, weapon, movement, rarity, origin, EN/JP voice actors, illustrator).
 *
 * The listing page supplies icon, name, profile link, voice actors and
 * illustrator; every profile page is then fetched (in one `fetchAll` call)
 * for the remaining fields. Afterwards the written area of the active sheet
 * is resized.
 *
 * Any error is caught and logged via `Logger.log`; nothing is rethrown.
 *
 * NOTE(review): values are written to the spreadsheet identified by `ss_id`,
 * while the range name and all formatting use the *active* sheet — presumably
 * these are the same spreadsheet; confirm with the caller.
 *
 * @param {string} ss_id - Spreadsheet ID passed to `Sheets.Spreadsheets.Values.update`.
 * @param {string} column - Column letter(s) of the top-left cell, e.g. "A" or "AA".
 * @param {number} row - 1-based row of the top-left cell (assumed numeric, since `row + 1` is computed).
 * @param {*} valueInputOption - Passed through unchanged as the last argument of `Values.update`.
 */
function setHeroData(ss_id, column, row, valueInputOption) {
  try {
    const sheet = SpreadsheetApp.getActiveSheet();
    // Quote the sheet name (doubling embedded quotes) so names containing
    // spaces or special characters still form a valid A1 range.
    const quotedName = sheet.getName().replace(/'/g, "''");
    const range = `'${quotedName}'!${column}${row}`;
    // Kept from the original: column 1 is always set to 50px, whatever `column` is.
    sheet.setColumnWidth(1, 50);

    const domain = "https://gamepress.gg";
    const response = UrlFetchApp.fetch(`${domain}/feheroes/heroes`);
    const page = Cheerio.load(response.getContentText());
    const cells = page(".icon-cell"); // queried once instead of once per array

    // Row 0 is the header; the two empty headers sit above the weapon and
    // movement icon columns.
    const heroes = [[
      "Icon", "Hero", "Hero Epithet", "", "Weapon", "",
      "Movement", "Rarity", "Origin", "VA (EN)", "VA (JP)", "Illustrator",
    ]];
    const profiles = [];
    const vas = [];
    const illus = [];

    cells.each(function (i, elem) {
      const cell = page(elem);
      const img = domain + cell.find(".hero-icon").children("img").attr("src");
      heroes[i + 1] = [`=image("${img}", 3)`, cell.find(".adventurer-title").text()];

      profiles[i] = domain + cell.children("a").attr("href");
      // A missing data-va attribute yields undefined; fall back to "" so one
      // incomplete cell does not abort the whole import.
      vas[i] = cell.parent().attr("data-va") || "";
      illus[i] = cell.parent().attr("data-ill");
    });

    const heroCount = heroes.length - 1;
    const prof_pages = UrlFetchApp.fetchAll(profiles);

    // Complete each hero row from its profile page.
    for (let i = 0; i < heroCount; ++i) {
      const prof_page = Cheerio.load(prof_pages[i].getContentText());
      const attribute = prof_page(".vocabulary-attribute");
      const movement = prof_page(".vocabulary-movement");

      const attrib = attribute.find(".field--name-name").text();
      const attrib_img = domain + attribute.find("img").attr("src");
      const move_type = movement.find(".field--name-name").text();
      const move_type_img = domain + movement.find("img").attr("src");
      // Only the first character of the rarity text is used to pick the star image.
      const stars = prof_page(".vocabulary-obtainable-stars").find(".field--name-name").text()[0];
      const origin = prof_page(".field--name-field-origin").text().trim();
      const epithet = prof_page(".field--name-title").siblings("span").text().replace(" - ", "");

      const [en_va, jp_va] = splitVoiceActors_(vas[i]);

      heroes[i + 1].push(
        epithet,
        `=image("${attrib_img}")`,
        attrib,
        `=image("${move_type_img}")`,
        move_type,
        `=image("https://gamepress.gg/sites/fireemblem/files/2017-06/stars${stars}.png", 3)`,
        origin,
        en_va,
        jp_va,
        illus[i]
      );
      Logger.log((i * 100) / heroCount); // progress in percent
    }

    const first_col = columnLetterToIndex_(column);

    Sheets.Spreadsheets.Values.update({ values: heroes }, ss_id, range, valueInputOption);
    sheet.autoResizeColumns(first_col, heroes[0].length).autoResizeRows(row, heroes.length);
    // setRowHeights takes (startRow, numRows, height): the second argument is
    // a row COUNT, so it must be the number of hero rows, not an end row.
    if (heroCount > 0) {
      sheet.setRowHeights(row + 1, heroCount, 50);
    }
    sheet.setColumnWidth(first_col, 50);
    sheet.setColumnWidth(first_col + 3, 30);
    sheet.setColumnWidth(first_col + 5, 30);
    sheet.setColumnWidth(first_col + 7, 100);
    sheet.setColumnWidth(first_col + 8, 319);
  } catch (err) {
    Logger.log(err);
  }
}

/**
 * Splits a combined "EN name JP name" string into its two parts.
 *
 * Heuristic only: the string has no delimiter between the two names. If it
 * contains a ".", the split is at the first space found from two characters
 * after that dot (covers "Joe J. Thomas ..."); otherwise it is at the second
 * space. "Cassandra Lee Morris" is special-cased. Other three-word English
 * names will be split incorrectly.
 *
 * @param {string} va - Combined voice-actor string; empty/undefined is treated as "".
 * @returns {string[]} `[english, japanese]`, both trimmed; `japanese` is "" when no split point exists.
 */
function splitVoiceActors_(va) {
  // https://stackoverflow.com/questions/36342430/get-substring-before-and-after-second-space-in-a-string-via-javascript
  const names = va || "";
  const searchFrom = names.includes(".") ? names.indexOf(".") + 2 : names.indexOf(" ") + 1;
  const index = names.indexOf(" ", searchFrom);

  let en_va = index >= 0 ? names.slice(0, index) : names;
  let jp_va = index >= 0 ? names.slice(index + 1) : "";

  if (en_va.toLowerCase() === "cassandra lee") {
    en_va = `${en_va} Morris`;
    jp_va = jp_va.replace("Morris", "");
  }

  return [en_va.trim(), jp_va.trim()];
}

/**
 * Converts spreadsheet column letters to a 1-based column index
 * ("A" -> 1, "Z" -> 26, "AA" -> 27). Case-insensitive.
 *
 * @param {string} column - Column letters.
 * @returns {number} 1-based column index.
 */
function columnLetterToIndex_(column) {
  let index = 0;
  for (const ch of String(column).toUpperCase()) {
    index = index * 26 + (ch.charCodeAt(0) - 64);
  }
  return index;
}

Now, this code works fine for now, which means it populates a sheet with data scraped from a website. The problem is that there's a group of strings (VA names) that are not consistently formatted. There are supposed to be two VAs, an English one and a Japanese one, and both are stored in the same HTML attribute (meaning they're both in the same string), but, as names tend to be, there's a lot of variation. Some names are shorter, some longer; some have more words, some fewer; and sometimes it's hard to tell where one name ends and another starts. So far, I've only managed to solve this issue for names with dots (e.g. "Joe J. Thomas"), and for names like "Cassandra Lee Morris" I need to specify an edge case, which (from my understanding) is less than ideal. I have also tried to scrape a tag within the webpage that contains all names so that I can maybe validate the names, or take the names directly from that list, but I haven't had any luck.

EDIT: For example, these are strings with different sets of names:

a) "Julie Kliewer Uchida Maaya (内田真礼)"  
b) "Joe J. Thomas Suzuki Tatsuhisa (鈴木達央)"  
c) "Cassandra Lee Morris Fujiwara Natsumi (藤原夏海)"

I want to be able to extract the English names ("Julie Kliewer", "Joe J. Thomas", "Cassandra Lee Morris") and the Japanese names ("Uchida Maaya (内田真礼)", "Suzuki Tatsuhisa (鈴木達央)", "Fujiwara Natsumi (藤原夏海)"), and store each set of names separately.

9
  • You haven't stated what you are trying to do. You just dropped a block of code and seem to want us to figure it out. Are you trying to extract the english name from this string with both English and Japanese names in it? Commented Jul 4, 2022 at 0:29
  • Probably with the use of Regular Expressions, but you are going to have to provide us with representative examples of the data. Commented Jul 4, 2022 at 0:36
  • Sorry, I get easily sidetracked. Yes, I want to extract every english name and every japanese name separately. Commented Jul 4, 2022 at 0:36
  • Without (1) a unique delimiter between the "different" substrings in the string, or (2) a canonical map of substrings to validate against, this is not solvable. Commented Jul 4, 2022 at 1:03
  • e.g. Even intuitive rules have exceptions: F. Lee Morris (initial is in first name position), Thomas Tatsuhisa (EN given name, JP surname), etc. Commented Jul 4, 2022 at 1:10

1 Answer 1

0

After looking for a while, I found this json file that contains all the information I need, and also has separate entries for the different Voice Actors of each character. This works great for my purposes, and also makes it easier to associate the correct data to each character.

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.