Promises, Promises (Scraping Data with async functions)

I had to scrape a bunch of routing rules from a client’s default routing rules in Google Admin because there’s no way to export this list [according to Google support]

Bad news: there’s no JSON api here to scrape/hack/curl

Worse news: Google’s CSRF policy is pretty damn hard to get around - so I had to write this little scraper in vanilla JS with some promises to handle all the states

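The biggest thing to note is that you have to prototype your own asyncForEach function when iterating over an array, because the built-in forEach does not wait for async callbacks - it kicks off every iteration immediately, so the steps for different elements overlap instead of running one after another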

I needed each state to successfully execute and return before moving onto the next

  1. find the correct element in the dom (this is tricky bc Google uses a virtual dom with their material UI, so good luck trying to use any type of class naming standard)

  2. make sure that element is the one we want to simulate a click for (should have the text value of “Edit” and a data-id attribute)

  3. simulate click and wait for the modal to render

  4. scrape the default routing rule for the internal address, map it to the external address in an array

  5. close the modal

  6. move on to the next element

  7. repeat

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

// Accumulated scrape results: getEmails() pushes one array of unique
// addresses onto this list every time it runs.
const emails = [];

/**
 * Sequential, promise-aware counterpart to Array.prototype.forEach.
 *
 * The callback is invoked for one element at a time and its result is awaited
 * before the next element is visited, so async callbacks never overlap.
 * Works on any array-like receiver (e.g. a NodeList) when invoked through
 * `Array.prototype.asyncForEach.call(arrayLike, callback)`.
 *
 * Installed with Object.defineProperty so the method is NOT enumerable: a
 * plain assignment would make "asyncForEach" show up in every `for...in`
 * loop over any array on the page.
 *
 * @param {function(*, number, Object): *} callback - Called with
 *   (element, index, receiver); may return a promise.
 * @param {*} [thisArg] - Value used as `this` inside the callback. Falls back
 *   to the receiver itself when omitted or falsy.
 * @returns {Promise<undefined>} Settles after the last callback has settled;
 *   rejects with whatever the first failing callback throws/rejects with.
 * @throws {TypeError} (as a rejection) when `callback` is not a function.
 */
Object.defineProperty(Array.prototype, 'asyncForEach', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: async function asyncForEach(callback, thisArg) {
    if (typeof callback !== 'function') {
      throw new TypeError('asyncForEach callback must be a function');
    }
    const receiver = thisArg || this;
    // Length is read once, so elements appended during iteration are skipped.
    const length = this.length;
    // `<` (rather than `!==`) guarantees termination even when `length` is
    // missing, negative or fractional on an array-like receiver.
    for (let index = 0; index < length; index += 1) {
      await callback.call(receiver, this[index], index, this);
    }
  },
});

/**
 * Pause for a fixed delay so the page has time to react to the previous step.
 *
 * @param {number} [ms=1000] - Delay in milliseconds. Defaults to one second,
 *   which is what every existing caller relies on.
 * @returns {Promise<undefined>} Resolves once the delay has elapsed. The
 *   progress message is logged at that moment, immediately before resolving.
 */
function wait(ms = 1000) {
  return new Promise(function (resolve) {
    setTimeout(function () {
      // With the default delay this prints exactly "waiting 1 sec".
      console.log(`waiting ${ms / 1000} sec`);
      resolve();
    }, ms);
  });
}

/**
 * Log the target and trigger its click() handler.
 *
 * The click happens synchronously during the call; the returned promise only
 * exists so the step can be awaited like the others.
 *
 * @param {{click: function(): void}} el - Element (or element-like object) to click.
 * @returns {Promise<undefined>} Resolved after click() returns; rejected if
 *   click() throws.
 */
async function clickElement(el) {
  console.log('emulating click for ', el);
  el.click();
}


/**
 * Collect every e-mail address found in the markup of the currently focused
 * element (document.activeElement) and record the de-duplicated list.
 *
 * The list is pushed as ONE entry (an array) onto the module-level `emails`
 * array, and is also the resolved value so callers can use it directly.
 *
 * @param {*} el - The element whose click opened the current view. Used only
 *   for the log line; the text searched always comes from
 *   document.activeElement.
 * @returns {Promise<string[]>} Unique addresses in order of first appearance;
 *   an empty array when none are found.
 */
async function getEmails(el) {
  console.log('getting emails for ', el);
  const markup = document.activeElement.innerHTML;
  // match() returns null when nothing matches; normalise that to an empty list.
  const found =
    markup.match(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi) || [];
  // All intermediates are block-scoped: the previous undeclared assignments
  // created globals in sloppy mode and threw a ReferenceError in strict mode.
  const uniques = Array.from(new Set(found));
  emails.push(uniques);
  return uniques;
}


/**
 * Dismiss the open dialog by clicking the first <span> inside
 * document.activeElement whose text is exactly "Cancel".
 *
 * @returns {Promise<undefined>} Resolves once the click has been dispatched.
 * @throws {Error} (as a rejection) when no such span exists. Previously the
 *   promise simply never settled in that case, which stalled the whole scrape
 *   with no indication of what went wrong.
 */
async function closeModal() {
  const spans = Array.from(document.activeElement.querySelectorAll('span'));
  const cancel = spans.find((span) => span.textContent === 'Cancel');
  if (!cancel) {
    throw new Error('closeModal: no "Cancel" control found in the active element');
  }
  console.log('closing top window');
  cancel.click();
}


/**
 * Walk every `td a:first-child` link on the page, one at a time, and for each
 * link whose markup contains "Edit" and that carries a data-id attribute:
 * click it, read the e-mail addresses, and close the dialog again, pausing
 * via wait() after each of those three steps.
 *
 * Links are processed strictly sequentially through asyncForEach.
 *
 * @returns {Promise<undefined>} Settles once every matching link is handled.
 */
async function scrapePage() {
  const candidates = document.querySelectorAll('td a:first-child');

  // Handles a single link; anything that is not an "Edit" link is skipped.
  const visit = async (anchor) => {
    const isEditLink =
      anchor.innerHTML.includes("Edit") && anchor.hasAttribute("data-id");
    if (!isEditLink) {
      return;
    }
    await clickElement(anchor);
    await wait();
    await getEmails(anchor);
    await wait();
    await closeModal();
    await wait();
  };

  await Array.prototype.asyncForEach.call(candidates, visit);
}

// Start the scrape. The returned promise is otherwise unobserved, so attach a
// rejection handler: a failure in any step is reported in the console instead
// of surfacing as an unhandled promise rejection.
scrapePage().catch((error) => {
  console.error('scrape failed: ', error);
});