Revision: 65110
Updated Code
at July 1, 2014 21:57 by pantuts
Updated Code
<?php /** * AyeEmtract a class that crawls pages and * then searches for email addresses using curl in parallel. * * PHP version >= 5.x * * @category PHP * @author PANTUTS * @license http://www.gnu.org/licenses/gpl.txt * @link http://www.pantuts.com */ class AyeEmtract { // variable to store data private $_results = array(); // array of urls private $_urls = array(); // addition curl options private $_options = array(); // errors public $errors = array(); /** * Creates object and store array of urls and additional options * * @param array $urls array of urls * @param array $options addition curl options */ public function __construct($urls, $options = array()) { $this->_urls = $urls; $this->_options = $options; } /** * Sets curl_multi, curl options, and executes curl * per url. */ public function start() { // initialize curl multi handle $curlMaster = curl_multi_init(); // curl singles array $curlh = array(); // loop all urls foreach ($this->_urls as $i => $url) { // curl_init each url $curlh[$i] = curl_init(); curl_setopt_array( $curlh[$i], array( CURLOPT_URL => htmlentities(trim($url)), CURLOPT_SSL_VERIFYPEER => false, CURLOPT_USERAGENT => self::_setUserAgent(), CURLOPT_HEADER => false, CURLOPT_RETURNTRANSFER => true, CURLOPT_MAXREDIRS => 7, CURLOPT_CONNECTTIMEOUT => 20, CURLOPT_FRESH_CONNECT => true ) ); // check additional options if (!empty($this->_options)) { curl_setopt_array($curlh[$i], $this->_options); } // now add multi handle curl_multi_add_handle($curlMaster, $curlh[$i]); } // execute multi handles $running = null; do { // save errors if encountered if (curl_multi_exec($curlMaster, $running) === false) { $this->errors[$i] = 'ERROR: url = ' . curl_error($curlh[$i]) . ', code = ' . curl_errno($curlh[$i]); } } while ($running > 0); // get content on each url from curlh foreach ($curlh as $j => $ch) { // save results and remove handle $this->_results[$j] = curl_multi_getcontent($ch); curl_multi_remove_handle($curlMaster, $ch); } // close handle curl_multi_close($curlMaster); } /** * Returns array of emails * * @return array_unique(array) */ public function getEmails() { $emails = array(); // regex to find valid emails $re = "/([\s]*)([_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*([ ]+|)@([ ]+|)([a-zA-Z0-9-]+\.)+([a-zA-Z]{2,}))([\s]*)/i"; foreach ($this->_results as $i => $res) { // decode htmlentities like %3C $res = html_entity_decode($res); preg_match_all($re, $res, $matches); foreach ($matches[0] as $match) { // save to array found emails $emails[$i] = trim($match); } } // remove duplicates return array_unique($emails); } /** * Returns random user-agent * * @return $ua */ private static function _setUserAgent() { $userAgents = array( 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0', 'Opera/9.80 (J2ME/MIDP; Opera Mini/5.0 (Windows; U; Windows NT 5.1; en) AppleWebKit/886; U; en) Presto/2.4.15', 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))', 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2' ); $ua = $userAgents[array_rand($userAgents)]; return $ua; } } ?>
Revision: 65109
Initial Code
Initial URL
Initial Description
Initial Title
Initial Tags
Initial Language
at October 25, 2013 08:36 by pantuts
Initial Code
<?php /** * AyeEmtract a class that crawls pages and * then searches for email addresses using curl in parallel. * * PHP version >= 5.x * * @category PHP * @author PANTUTS * @license http://www.gnu.org/licenses/gpl.txt * @link http://pantuts.com/2013/10/24/ayeemtract-using-curl-parallel/ */ class AyeEmtract { // variable to store data private $_results = array(); // array of urls private $_urls = array(); // addition curl options private $_options = array(); // errors public $errors = array(); /** * Creates object and store array of urls and additional options * * @param array $urls array of urls * @param array $options addition curl options */ public function __construct($urls, $options = array()) { $this->_urls = $urls; $this->_options = $options; } /** * Sets curl_multi, curl options, and executes curl * per url. */ public function start() { // initialize curl multi handle $curlMaster = curl_multi_init(); // curl singles array $curlh = array(); // loop all urls foreach ($this->_urls as $i => $url) { // curl_init each url $curlh[$i] = curl_init(); curl_setopt_array( $curlh[$i], array( CURLOPT_URL => htmlentities(trim($url)), CURLOPT_SSL_VERIFYPEER => false, CURLOPT_USERAGENT => self::_setUserAgent(), CURLOPT_HEADER => false, CURLOPT_RETURNTRANSFER => true, CURLOPT_FOLLOWLOCATION => true, CURLOPT_MAXREDIRS => 7, CURLOPT_CONNECTTIMEOUT => 20, CURLOPT_FRESH_CONNECT => true ) ); // check additional options if (!empty($this->_options)) { curl_setopt_array($curlh[$i], $this->_options); } // now add multi handle curl_multi_add_handle($curlMaster, $curlh[$i]); } // execute multi handles $running = null; do { // save errors if encountered if (curl_multi_exec($curlMaster, $running) === false) { $this->errors[$i] = 'ERROR: url = ' . curl_error($curlh[$i]) . ', code = ' . curl_errno($curlh[$i]); } } while ($running > 0); // get content on each url from curlh foreach ($curlh as $j => $ch) { // save results and remove handle $this->_results[$j] = curl_multi_getcontent($ch); curl_multi_remove_handle($curlMaster, $ch); } // close handle curl_multi_close($curlMaster); } /** * Returns array of emails * * @return array_unique(array) */ public function getEmails() { $emails = array(); // regex to find valid emails $re = "/([\s]*)([_a-zA-Z0-9-]+(\.[_a-zA-Z0-9-]+)*([ ]+|)@([ ]+|)([a-zA-Z0-9-]+\.)+([a-zA-Z]{2,}))([\s]*)/i"; foreach ($this->_results as $i => $res) { // decode htmlentities like %3C $res = html_entity_decode($res); preg_match_all($re, $res, $matches); foreach ($matches[0] as $match) { // save to array found emails $emails[$i] = trim($match); } } // remove duplicates return array_unique($emails); } /** * Returns random user-agent * * @return $ua */ private static function _setUserAgent() { $userAgents = array( 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0', 'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0', 'Opera/9.80 (J2ME/MIDP; Opera Mini/5.0 (Windows; U; Windows NT 5.1; en) AppleWebKit/886; U; en) Presto/2.4.15', 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))', 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36', 'Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36', 'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2' ); $ua = array_rand($userAgents); return $ua; } } ?>
Initial URL
Initial Description
<a href="http://pantuts.com/2013/10/24/ayeemtract-using-curl-parallel/">http://pantuts.com/2013/10/24/ayeemtract-using-curl-parallel/</a>
Initial Title
AyeEmtract - PHP Class that implements Curl in Parallel to extract Email Addresses
Initial Tags
email, curl, php
Initial Language
PHP