How to figure out your side project via crawling?
How to integrate an existing project with your crawler?
Peter
Active open source contributor
An associate engineer
3+ years of PHP development
PHP 5.3 → PHP 7+
No framework→Slim→Laravel
Working for ITRI currently
Smart Grid technology
#01 '##################################################
#02 ' Set maximum web page time out
#03 SET !TIMEOUT 240
#04 ' Tell iMacros to ignore error messages
#05 SET !ERRORIGNORE YES
#06 ' Clear ALL cookies
#07 CLEAR
#08 ' Initialize Browser tab 1, close all other tabs
#09 TAB T=1
#10 TAB CLOSEALLOTHERS
#11 ' Tell iMacros to ignore images (nice if using Tor)
#12 FILTER TYPE=IMAGES STATUS=ON
#13 ' Tell iMacros to ignore extract messages
#14 SET !EXTRACT_TEST_POPUP NO
#15 '##################################################
# Download the Composer installer and pipe it straight into PHP to run it.
# NOTE(review): the commands below expect composer.phar in the home directory,
# so this is presumably run from ~ — confirm.
curl -sS https://getcomposer.org/installer | php
# Add the HTTP client plus the HTML-parsing packages (-n: no interactive prompts).
php ~/composer.phar require guzzlehttp/guzzle:^6.2 -n
php ~/composer.phar require symfony/dom-crawler:^4.3 -n
php ~/composer.phar require symfony/css-selector:^4.3 -n
# Optional alternative package, left disabled:
# php ~/composer.phar require fabpot/goutte:^4.0 -n
{
"content":"\n\n\t<div class=\"row listBS\">\n\t\n\t\n\t\t\n\t\t<div class=\"d-item d-title col-sm-12\">\n<div class=\"mbox\">\n\t<div class=\"d-txt\">\n <div class=\"mtitle\">\n\t\t\t\n\t\t\t<a href=\"http:\/\/aa.nttu.edu.tw\/p\/404-1002-99926-1.php\">\n\t\t\t\t\u3010\u6559\u52d9\u8655\u8ab2\u52d9\u7d44\u3011109\u5b78\u5e74\u5ea6\u7b2c1\u5b78\u671f(\u9032\u4fee\u5b78\u5236)\u9078\u8ab2\u4f5c\u696d\u6642\u7a0b(\u7db2\u8def\u52a0\u9000\u9078\u8ab2\u6642\u9593:109\u5e749\u670814\u65e5(\u4e00)08:00~9\u670818\u65e5(\u4e94)24:00)\n\t\t\t<\/a>\n\t\t\t\n\t\t\t<span class=\"subsitename newline\"><\/span>\n\t\t<\/div>\n\t<\/div>\n\t\n<\/div>\n<\/div>\n\n\t\t<\/div><div class=\"row listBS\">\n\t\n\t\t\n\t\t<div class=\"d-item d-title col-sm-12\">\n<div class=\"mbox\">\n\t<div class=\"d-txt\">\n <div class=\"mtitle\">\n\t\t\t\n\t\t\t<a href=\"http:\/\/wdsa.nttu.edu.tw\/p\/404-1009-99907-1.php\">\n\t\t\t\t\u3010\u5b78\u52d9\u8655\u6821\u5b89\u4e2d\u5fc3\u3011\u8f49\u6559\u80b2\u90e8\u8acb\u5404\u7d1a\u5b78\u6821109\u5e74\u6691\u5047\u671f\u9593\u5b78\u751f\u6d3b\u52d5\u5b89\u5168\u6ce8\u610f\u4e8b\u9805\n\t\t\t<\/a>\n\t\t\t\n\t\t\t<span class=\"subsitename newline\"><\/span>\n\t\t<\/div>\n\t<\/div>\n\t\n<\/div>\n<\/div>\n\n\t\t<\/div><div class=\"row listBS\">\n\t\n\t\t\n\t\t<div class=\"d-item d-title col-sm-12\">\n<div class=\"mbox\">\n\t<div class=\"d-txt\">\n <div class=\"mtitle\">\n\t\t\t\n\t\t\t<a href=\"http:\/\/aa.nttu.edu.tw\/p\/404-1002-99911-1.php\">\n\t\t\t\t\u3010\u6559\u52d9\u8655\u3011109\u5b78\u5e74\u5ea6\u7b2c1\u5b78\u671f\u6559\u5e2b\u6559\u5b78\u5927\u7db1\u4e0a\u50b3\u53ca\u554f\u5377\u985e\u578b\u8a2d\u5b9a\u901a\u77e5-\u81f3(109\/8\/14\u622a\u6b62)\uff0c\u8acb\u5404\u4f4d\u5e2b\u9577\u7559\u610f\uff01\n\t\t\t<\/a>\n\t\t\t\n\t\t\t<span class=\"subsitename newline\"><\/span>\n\t\t<\/div>\n\t<\/div>\n\t\n<\/div>\n<\/div>\n\n\t\t\n\t\n\t<\/div>\n\n\n\n",
"stat":"over"
}
<?php
// Posts the paging form to the NTTU news-list endpoint and dumps the raw
// response body as a string.
require_once __DIR__ . '/vendor/autoload.php';

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

$latestNews = 'https://www.nttu.edu.tw/app/index.php?Action=mobileassocgmolist';

// Fields sent as an application/x-www-form-urlencoded body.
$formParams = ['form_params' => []];
$formParams['form_params']['Cg'] = '1009';
$formParams['form_params']['IsTop'] = '0';
$formParams['form_params']['Op'] = 'getpartlist';
$formParams['form_params']['Page'] = '1';

$client = new Client();
$response = $client->request('POST', $latestNews, $formParams);

// Casting the body stream to string reads the whole payload.
$latestNewsString = (string) $response->getBody();

var_dump($latestNewsString);
<?php
// Decodes the JSON answer of the news-list endpoint and collects the title and
// target URL of every <a> element found in its HTML fragment.
require_once __DIR__ . '/vendor/autoload.php';
use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;
//......
// The response is a JSON object; the HTML to crawl is stored under "content".
$latestNewsString = json_decode($latestNewsString, true);
if (!is_array($latestNewsString) || !isset($latestNewsString['content'])) {
    throw new \RuntimeException('Unexpected latest-news response: no "content" field.');
}
$crawler = new Crawler($latestNewsString['content']);

// Filled in document order; index N of $titles belongs to index N of $links.
$titles = [];
$links = [];
$crawler
    ->filter('a')
    // each() is the visitor API; reduce() is meant for filtering nodes.
    ->each(function (Crawler $node, $i) use (&$titles, &$links) {
        $titles[] = $node->text();
        $links[] = $node->attr('href');
    });
var_dump($links);
var_dump($titles);
array(4) {
[0]=>
string(45) "https://aa.nttu.edu.tw/p/404-1002-90907-1.php"
[1]=>
string(48) "https://enews.nttu.edu.tw/p/404-1045-90881-1.php"
[2]=>
string(48) "https://enews.nttu.edu.tw/p/404-1045-90876-1.php"
[3]=>
string(45) "https://aa.nttu.edu.tw/p/404-1002-90906-1.php"
}
array(4) {
[0]=>
string(93) "
【教務處】大一新生「運動、美術、音樂」績優獎學金申請公告
"
[1]=>
string(55) "
【秘書室】東大簡訊-13號刊(20190903)
"
[2]=>
string(75) "
【秘書室】恭賀!音樂學系何育真老師榮升副教授
"
[3]=>
string(74) "
【教務處】核發108-1舊生續領設籍臺東獎學金公告
"
}
<div>
<span id="lblMsg">The error message:</span><br />
<textarea name="txtMsg" rows="2" cols="20" id="txtMsg" class="input">
此頁面正在執行非同步回傳,
但 ScriptManager.SupportsPartialRendering 屬性卻是設定為 false。
請於非同步回傳時將此屬性設定為 true。</textarea><br />
<span id="lblStackTrace">The error stack trace:</span><br />
<textarea name="txtStackTrace" rows="2" cols="20" id="txtStackTrace" class="input">
於 System.Web.UI.ScriptManager.OnPageInitComplete(Object sender, EventArgs e)
於 System.Web.UI.Page.OnInitComplete(EventArgs e)
於 System.Web.UI.Page.ProcessRequestMain(Boolean includeStagesBeforeAsyncPoint, Boolean includeStagesAfterAsyncPoint)</textarea>
<?php
// Fetches the public course-list page with browser-like headers and a cookie
// jar, so that a follow-up POST can reuse the same session.
require_once __DIR__ . '/vendor/autoload.php';
use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;
$publicCourses = 'https://infosys.nttu.edu.tw/n_CourseBase_Select/CourseListPublic.aspx';
$headers = [
    'Host' => 'infosys.nttu.edu.tw',
    'Connection' => 'keep-alive',
    'Cache-Control' => 'max-age=0',
    'Upgrade-Insecure-Requests' => '1',
    'Sec-Fetch-Mode' => 'navigate',
    'Sec-Fetch-User' => '?1',
    // Media ranges are comma-separated; the separator before
    // application/signed-exchange was missing, which corrupted the "*/*" entry.
    'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Sec-Fetch-Site' => 'none',
    'Referer' => 'https://infosys.nttu.edu.tw/',
    'Accept-Encoding' => 'gzip, deflate, br',
    'Accept-Language' => 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
];
// 'cookies' => true makes the client keep one shared cookie jar across requests.
$client = new Client(['cookies' => true]);
$response = $client->request('GET', $publicCourses, [
    // Prints the raw HTTP exchange while the script runs.
    'debug' => true,
    'headers' => $headers,
]);
......
// Reads the hidden form fields of the fetched page so they can be echoed back
// in the follow-up POST.
$publicCourseString = (string)$response->getBody();
$crawler = new Crawler($publicCourseString);

// extract() yields one [name, value] pair per hidden input; array_column()
// turns that list into a "name => value" map.
$hiddenFields = array_column(
    $crawler->filter('input[type="hidden"]')->extract(['name', 'value']),
    1,
    0
);

// Fail loudly instead of posting a placeholder when a required field is absent.
foreach (['__VIEWSTATE', '__EVENTVALIDATION'] as $requiredField) {
    if (!isset($hiddenFields[$requiredField]) || $hiddenFields[$requiredField] === '') {
        throw new \RuntimeException(sprintf('Hidden field "%s" not found in the page.', $requiredField));
    }
}

$viewState = $hiddenFields['__VIEWSTATE'];
$eventValidation = $hiddenFields['__EVENTVALIDATION'];
// Prefer the value served by the page; fall back to the previously hard-coded one.
$viewStateGenerator = $hiddenFields['__VIEWSTATEGENERATOR'] ?? '5D156DDA';
......
// Request options for the course-search POST. 'form_params' is sent as a
// URL-encoded form body.
$formParams = [
'form_params' => [
'ToolkitScriptManager1' => 'UpdatePanel1|Button3',
'ToolkitScriptManager1_HiddenField' => '',
'__EVENTTARGET' => '',
'__EVENTARGUMENT' => '',
'__LASTFOCUS' => '',
// Values read from the hidden inputs of the previously fetched page.
'__VIEWSTATE' => $viewState,
'__VIEWSTATEGENERATOR' => $viewStateGenerator,
'__SCROLLPOSITIONX' => '0',
'__SCROLLPOSITIONY' => '0',
'__VIEWSTATEENCRYPTED' => '',
'__EVENTVALIDATION' => $eventValidation,
// NOTE(review): presumably the search filters of the form ('%' looks like a
// match-all wildcard, '1071' like an academic term) — confirm against the page.
'DropDownList1' => '1071',
'DropDownList6' => '1',
'DropDownList2' => '%',
'DropDownList3' => '%',
'DropDownList4' => '%',
'TextBox9' => '',
'DropDownList5' => '%',
'DropDownList7' => '%',
'TextBox1' => '',
'DropDownList8' => '%',
'TextBox6' => '0',
'TextBox7' => '14',
// NOTE(review): appears to flag the request as an asynchronous postback — confirm.
'__ASYNCPOST' => 'true',
'Button3' => '查詢',
],
ue="/wEdAAbxmE99JhLisIrrBlSpleKvA3sa9CLAiY0NRgwF9EJQGh6kvJC1EopKW4ZDfj9Gj7oGHrYxvYrs5XDlrjyz+wVULvWz/wJ+1kADwg6S0w9SXo/Fg06KOWoBIRHuyh28DoVPLgf8rKyi7Ffc8EgW/ntaNx+wYA==" />
</div>
<div>
<span id="lblMsg">The error message:</span><br />
<textarea name="txtMsg" rows="2" cols="20" id="txtMsg" class="input">
無效的 Viewstate。
Client IP: 61.230.251.119
Port: 38320
Referer:
Path: /n_CourseBase_Select/CourseListPublic.aspx
User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36
......
'headers' => [
    // Guzzle expects an associative "Header-Name => value" map. Entries written
    // as 'Name: value' strings get numeric keys and are not sent as the
    // intended headers.
    'Sec-Fetch-Mode' => 'cors',
    'Origin' => 'https://infosys.nttu.edu.tw',
    'Accept-Encoding' => 'gzip, deflate, br',
    'Accept-Language' => 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'X-Requested-With' => 'XMLHttpRequest',
    'Connection' => 'keep-alive',
    'X-MicrosoftAjax' => 'Delta=true',
    'Accept' => '*/*',
    'Cache-Control' => 'no-cache',
    'Referer' => 'https://infosys.nttu.edu.tw/n_CourseBase_Select/CourseListPublic.aspx',
    'Sec-Fetch-Site' => 'same-origin',
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
],
];
// Submit the search form through the same client and dump the raw response body.
$response = $client->request('POST', $publicCourses, $formParams);
$coursesString = (string)$response->getBody();
var_dump($coursesString);
# Load the page in headless Chrome and print the resulting DOM to stdout.
google-chrome-stable --headless --dump-dom "https://google.com"
# Install Command Wrapper for Headless Chrome
php ~/composer.phar require chrome-php/chrome:^0.8 -n
<?php
// Opens an episode page in headless Chrome and dumps the JSON object the page
// stores in localStorage.
// Resolve the autoloader relative to this file rather than the working directory.
require_once __DIR__ . '/vendor/autoload.php';
use HeadlessChromium\BrowserFactory;
$url = 'https://baabao.com/single-episode/2792254?to=1596211873940&s=8TBkr';
// The stored item is parsed twice because the script treats it as
// double-encoded JSON.
$jsCode = "JSON.parse(JSON.parse(localStorage.getItem('localforage/listen_history/lastListenEpisode')))";
$browserFactory = new BrowserFactory('google-chrome-stable');
// starts headless chrome
$browser = $browserFactory->createBrowser();
try {
    // creates a new page and navigate to an url
    $page = $browser->createPage();
    $page->navigate($url)->waitForNavigation();
    // get JSON with single episode info
    $episodeInfo = $page->evaluate($jsCode)->getReturnValue();
    var_dump($episodeInfo);
} finally {
    // bye — always shut the browser down, even when navigation or evaluation throws
    $browser->close();
}
/data/badoo-episode/sample.php:20:
array(26) {
'image' =>
string(95) "https://baabao-programs-images.s3.amazonaws.com/efee0e645c6c49bc87ef7972211be7c5--1_400_400.jpg"
'episode_data_url' =>
string(168) "https://d3hl6newtgi50f.cloudfront.net/0dd31152b9db415bbae239bcba2b61ba--1125+%E5%AF%B6%E8%B2%9D%E7%89%B9%E6%B4%BE%E5%93%A1+%E7%AC%AC40%E9%9B%86+45%E5%88%86%E9%90%98.mp3"
'emojis' =>
array(1) {
[0] =>
array(2) {
'description' =>
string(12) "給個鼓勵"
'count' =>
int(0)
}
}
'subscribed' =>
......
<?php
// Logs in to the shop: loads the login page, downloads the captcha image,
// reads it with the tesseract CLI, and posts the login form together with the
// page's hidden token.
// Resolve the autoloader relative to this file rather than the working directory.
require_once __DIR__ . '/vendor/autoload.php';
use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;
$loginUrl = 'https://www.leezen.com.tw/login.php';
$captchaUrl = 'https://www.leezen.com.tw/captcha/code.php';
// One cookie jar is shared by every request made through this client.
$client = new Client(['cookies' => true]);
$response = $client->request('GET', $loginUrl);
$loginPageResponse = (string)$response->getBody();
$codeResponse = $client->request('GET', $captchaUrl);
// ......
if (file_put_contents('./code.png', (string)$codeResponse->getBody()) === false) {
    throw new \RuntimeException('Unable to write the captcha image to ./code.png.');
}
// tesseract writes the recognised text to "<output base>.txt", i.e. ./code.txt.
exec('tesseract ./code.png code', $ocrOutput, $ocrExitCode);
if ($ocrExitCode !== 0) {
    throw new \RuntimeException('tesseract exited with status ' . $ocrExitCode . '.');
}
$code = file_get_contents('./code.txt');
if ($code === false) {
    throw new \RuntimeException('Unable to read the OCR result from ./code.txt.');
}
// Keep the first run of digits; stop here when the OCR produced none instead of
// reading an undefined match.
if (preg_match('/(\d+)/', $code, $matched) !== 1) {
    throw new \RuntimeException('No digits recognised in the captcha image.');
}
$code = $matched[0];
$crawler = new Crawler($loginPageResponse);
// Use the last hidden input named "token"; stays '' when the page has none.
$token = '';
$tokenInputs = $crawler->filter('input[type="hidden"][name="token"]');
if ($tokenInputs->count() > 0) {
    $token = $tokenInputs->last()->attr('value');
}
// ......
$formParams = [
    'form_params' => [
        'member' => 'email_or_phone_number',
        'member_m' => 'email_or_phone_number',
        'member_password' => 'password',
        'Mode' => 'login',
        'token' => $token,
        'Turing2' => $code,
        'login' => '登入',
    ],
];
// Do Login Action!
$postLoginUrl = 'https://www.leezen.com.tw/member_process.php';
$response = $client->request('POST', $postLoginUrl, $formParams);
$loginResponseString = (string)$response->getBody();
var_dump($loginResponseString);
alert("登入成功,歡迎您回到天天里仁!");window.location.replace("index.php")