Data mining

of studydrive

"nobody wants to fill out my survey"

Situation:

{
  "id": 8,
  "name": "Johann Schreiner",
  "slug": "johann-schreiner",
  "karma": 25,
  "gender": null,
  "karma_ranking": "Mouse",
  "credits": "47872",
  "image": {
    "id": null,
    "url_big": null,
    "url_original": null,
    "url_thumbnail": null,
    "avatar": {
      "url_thumb": "https://www.studydrive.net/images/avatars/karma/thumb/mouse.png",
      "url_big": "https://www.studydrive.net/images/avatars/karma/big/mouse.png",
      "url_original": "https://www.studydrive.net/images/avatars/karma/original/mouse.png"
    }
  },
  "studies": []
}

{
  "is_owner": false,
  "downloads_generated": "15675",
  "flashcards_created": null,
  "documents_created": null,
  "posts_created": null
}

{
  "current_page": 1,
  "from": 1,
  "path": "https://www.studydrive.net/profile/8/documents",
  "per_page": 20,
  "to": 20,
  "total": 181
}

id: the user's ID

name: the user's first and last name

slug: a URL-friendly version of the name

karma: the user's karma points

karma_ranking: the user's karma rank

credits: the user's credit points

documents: the number of documents uploaded

downloads: the number of downloads these documents generated

const got = require('got');
const cheerio = require('cheerio');
const moment = require('moment');
const Queue = require('bull');

// Bull queue named 'crawl', backed by the Redis instance on localhost.
// NOTE(review): the limiter presumably caps processing at `max` jobs per
// `duration` milliseconds — confirm against the Bull documentation.
const queue = new Queue('crawl', 'redis://127.0.0.1:6379', {
  limiter: {
    max: 5,
    duration: 10,
  },
});

// Not referenced by any of the code visible in this file.
let flag = false;

/**
 * Requests `url` and resolves to the response body, or to null when the
 * request fails for any reason.
 *
 * Failures other than HTTP 404 are reported on stderr; stdout is reserved for
 * the CSV rows the crawler emits.
 *
 * @param {string} url - Address to request.
 * @param {object} [options] - Options forwarded to `got`.
 * @returns {Promise<string|null>} The response body, or null on failure.
 */
async function requestBody(url, options = {}) {
  try {
    const res = await got(url, options);
    return res.body;
  } catch (err) {
    // Depending on the got version the status code lives on the error itself
    // or on the attached response object.
    const status = err && (err.statusCode || (err.response && err.response.statusCode));
    if (Number(status) !== 404) {
      console.error(`request failed: ${url}: ${err && err.message}`);
    }
    return null;
  }
}

/**
 * Requests `url` and parses the body as JSON.
 *
 * @param {string} url - Address to request.
 * @param {object} [options] - Options forwarded to `got`.
 * @returns {Promise<*>} The parsed payload, or null when the request failed
 *   or the body was not valid JSON.
 */
async function requestJson(url, options = {}) {
  const body = await requestBody(url, options);
  if (body == null) {
    return null;
  }

  try {
    return JSON.parse(body);
  } catch (err) {
    console.error(`invalid JSON from ${url}: ${err.message}`);
    return null;
  }
}

/**
 * Crawls a single profile: the profile page, the first page of the document
 * listing and the stats endpoint.
 *
 * @param {number} id - Numeric profile id.
 * @returns {Promise<object|null>} The data embedded in the profile page (minus
 *   `image`) merged with `created_at`, `id`, `documents` and `downloads`; or
 *   null when any of the three requests fails or returns an unusable payload.
 */
async function fetch(id) {
  const profileUrl = `https://www.studydrive.net/en/profile/xx/${id}`;
  const statsUrl = `https://www.studydrive.net/profile/${id}/stats`;
  const documentsUrl = `https://www.studydrive.net/profile/${id}/documents?page=1`;

  const html = await requestBody(profileUrl);
  if (html == null) {
    return null;
  }

  const $ = cheerio.load(html, { normalizeWhitespace: true, decodeEntities: false });
  let obj = {
    created_at: new Date().toISOString(),
    id: id,
  };

  // The profile data is embedded as JSON in the `:profile` attribute of the
  // <profile-show> element. Extraction stays best-effort: when it is missing
  // or unparsable the row keeps only the fields set above.
  const rawProfile = $('profile-show').attr(':profile');
  if (rawProfile != null) {
    let userdata = null;
    try {
      userdata = JSON.parse(rawProfile);
    } catch (err) {
      console.error(`profile ${id}: unparsable profile data: ${err.message}`);
    }

    if (userdata) {
      if (Array.isArray(userdata.studies) && userdata.studies.length > 0) {
        const firstStudy = userdata.studies[0];
        userdata.study = firstStudy;
        if (firstStudy != null) {
          userdata.major = firstStudy.major;
        }
      }
      obj = { ...obj, ...userdata };
    }
  }

  delete obj.image;

  const options = {
    headers: {
      'x-requested-with': 'XMLHttpRequest',
    },
  };

  const documents = await requestJson(documentsUrl, options);
  if (documents == null || documents.meta == null) {
    return null;
  }
  obj.documents = Number(documents.meta.total) || 0;

  const stats = await requestJson(statsUrl, options);
  if (stats == null || stats.data == null) {
    return null;
  }
  obj.downloads = Number(stats.data.downloads_generated) || 0;

  return obj;
}

/**
 * Renders one CSV line for the crawler output.
 *
 * Called without an argument (or with any falsy value) it returns the header
 * line. Otherwise it returns the row for `obj`, with columns in header order.
 * Missing or null fields become empty cells. Values containing a comma, a
 * double quote or a line break are wrapped in double quotes with embedded
 * quotes doubled, so free-text fields such as `name` cannot break the row.
 *
 * @param {object} [obj] - Record to serialise.
 * @returns {string} A single CSV line without a trailing newline.
 */
function json2csv(obj) {
  const keys = ['id', 'name', 'slug', 'karma', 'karma_ranking', 'credits', 'documents', 'downloads'];

  if (!obj) {
    return keys.join(',');
  }

  const cells = keys.map((key) => {
    const value = obj[key];
    if (value == null) {
      return '';
    }

    const text = String(value);
    if (/[",\r\n]/.test(text)) {
      return `"${text.replace(/"/g, '""')}"`;
    }
    return text;
  });

  return cells.join(',');
}

// Worker: crawls the profile whose id is in the job data and prints it as a
// CSV row on stdout. Profiles for which fetch() yields nothing print no row.
// Because the handler declares `done`, Bull treats it as callback-style, so
// `done` has to be called on every path — a crawl error is handed to
// `done(err)` instead of leaving the job unsettled.
queue.process(async function(job, done){
  let result;
  try {
    result = await fetch(job.data.id);
  } catch (err) {
    done(err);
    return;
  }

  if (result) {
    console.log(json2csv(result));
  }
  done();
});

/**
 * Prints the CSV header and enqueues one crawl job per profile id in
 * `[start, start + count)`.
 *
 * Jobs are added in batches and each batch is awaited, so a failing
 * `queue.add` rejects this function instead of becoming an unhandled
 * rejection, and the number of pending add calls stays bounded.
 *
 * @param {number} [start=1134903] - First profile id to enqueue.
 * @param {number} [count=100000] - Number of consecutive ids to enqueue.
 * @returns {Promise<void>} Resolves once every job has been added.
 */
async function seed(start = 1134903, count = 100000) {
  console.log(json2csv());

  const batchSize = 1000;
  const end = start + count;

  for (let from = start; from < end; from += batchSize) {
    const to = Math.min(from + batchSize, end);
    const pending = [];
    for (let id = from; id < to; id++) {
      pending.push(queue.add({ id: id }));
    }
    await Promise.all(pending);
  }
}

seed().catch((err) => {
  // Report on stderr so the CSV on stdout stays clean.
  console.error(err);
  process.exitCode = 1;
});

27 MB csv
crawled: 1.6M

What can we say about this data?

deleted accounts

284,989 (17.81%)