Results 1 to 2 of 2
-
15th May 2009, 12:02 AM #1OPMemberWebsite's:
litewarez.net litewarez.com triniwarez.comUltimate PHP Scraper V1 (parse websites). V1
Heya all,
Here's a scraper that I created that you can use to split a website's HTML source into different arrays.
Currently supports:
- doctype
- doc title
- keywords
- link rel
- external css
- h1
- h2
- h3
- h4
- h5
- h6
- p
- a content
- a href
- a href count
- a additionaltags
- span
- script
- ul
- li
- comments
- ids
- classes
- meta content
- styles
- tag titles
- image alt
- images
- mailto
- emails
- count keywords
HERE'S THE CLASS FILE
Save this as parse.php
PHP Code:<?php
/**
 * Downloads one web page with cURL and exposes regex-based extractors for
 * common pieces of its HTML source (title, headings, links, e-mails, ...).
 *
 * Unless stated otherwise an extractor returns a two-element array:
 *   [0] the extracted value(s), [1] how many were found.
 *
 * The extractors are plain regular expressions, not an HTML parser, so the
 * results are approximate by nature.
 */
class ParseSite{
    /** Page source the extractors run against; '' when nothing could be downloaded. */
    public $DataFromSite = '';
    /** URL handed to the constructor. */
    public $url = '';
    /** cURL options used for the download. */
    public $CurlOP = array();
    /** cURL handle used for the download. */
    public $ch = null;
    /** Raw curl_exec() result: the body as a string, or false when the transfer failed. */
    public $Data = '';
    /** curl_error() text of a failed download, '' otherwise. */
    public $Error = '';
    /** Whether TLS certificates/host names are verified during the download. */
    public $VerifySsl = false;

    /**
     * Fetches $url immediately and keeps its body in $DataFromSite.
     *
     * @param string $url       Address of the page to scrape.
     * @param bool   $verifySsl Verify the TLS certificate and host name. Defaults to
     *                          false to keep the historic behaviour of this class;
     *                          pass true whenever the target has a valid certificate.
     */
    public function __construct($url, $verifySsl = false){
        $this->url = $url;
        $this->VerifySsl = (bool) $verifySsl;
        $this->DataFromSite = $this->grab_page();
    }

    /**
     * Downloads $this->url.
     *
     * The raw curl_exec() result is kept in $Data and the cURL error text, if any,
     * in $Error, so callers can tell a failed transfer from an empty page.
     *
     * @return string The response body, or '' when the transfer failed.
     */
    private function grab_page(){
        $this->CurlOP = array(
            CURLOPT_RETURNTRANSFER => true,      // return web page
            CURLOPT_HEADER         => false,     // don't return headers
            CURLOPT_FOLLOWLOCATION => true,      // follow redirects
            CURLOPT_ENCODING       => "",        // handle all encodings
            CURLOPT_USERAGENT      => "LWS V1.0", // who am i
            CURLOPT_AUTOREFERER    => true,      // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,       // timeout on connect
            CURLOPT_TIMEOUT        => 120,       // timeout on response
            CURLOPT_MAXREDIRS      => 10,        // stop after 10 redirects
            CURLOPT_SSL_VERIFYHOST => $this->VerifySsl ? 2 : 0,
            CURLOPT_SSL_VERIFYPEER => $this->VerifySsl,
        );
        $this->Error = '';
        $this->ch = curl_init($this->url);
        if ($this->ch === false) {
            $this->Data = false;
            $this->Error = 'curl_init() failed';
            return '';
        }
        curl_setopt_array($this->ch, $this->CurlOP);
        $this->Data = curl_exec($this->ch);
        if ($this->Data === false) {
            $this->Error = curl_error($this->ch);
        }
        // curl_close() has no effect from PHP 8.0 on; only older versions need it
        // to release the handle.
        if (PHP_VERSION_ID < 80000) {
            curl_close($this->ch);
        }
        return is_string($this->Data) ? $this->Data : '';
    }

    /**
     * Runs a single-match pattern against the page.
     *
     * @param string $pattern PCRE pattern.
     * @param int    $group   Capture group to return (0 = whole match).
     * @return array array(matched text, 1) on a hit, array(null, 0) otherwise.
     */
    private function match_first($pattern, $group){
        $patterns = array();
        $found = preg_match($pattern, (string) $this->DataFromSite, $patterns);
        if ($found === 1 && isset($patterns[$group])) {
            return array($patterns[$group], 1);
        }
        return array(null, 0);
    }

    /**
     * Runs a match-all pattern against the page.
     *
     * @param string $pattern PCRE pattern.
     * @param int    $group   Capture group to collect (0 = whole match).
     * @return array List of captured strings; empty when nothing matched or PCRE failed.
     */
    private function match_all($pattern, $group){
        $patterns = array();
        preg_match_all($pattern, (string) $this->DataFromSite, $patterns);
        if (isset($patterns[$group]) && is_array($patterns[$group])) {
            return $patterns[$group];
        }
        return array();
    }

    /**
     * Same as match_all() but in the class' usual array(values, count) shape.
     */
    private function match_all_counted($pattern, $group){
        $hits = $this->match_all($pattern, $group);
        return array($hits, count($hits));
    }

    /**
     * Collects the text content of every <$tag ...>...</$tag> element whose content
     * starts with a word character.
     *
     * Note: "<$tag.*>" also accepts longer tag names sharing the prefix
     * (e.g. "p" also opens on "<pre>").
     */
    private function tag_content($tag){
        return $this->match_all_counted('/(<' . $tag . '.*>)(\w.*)(<\/' . $tag . '>)/ismU', 2);
    }

    // retrieve the doctype declaration (whole "<!DOCTYPE ...dtd">" text)
    public function get_doctype(){
        return $this->match_first('/<!DOCTYPE (\w.*)dtd">/is', 0);
    }

    // retrieve page title (including the surrounding <title> tags)
    public function get_doc_title(){
        return $this->match_first('/<title> ?.* <\/title>/isx', 0);
    }

    // retrieve keywords (content of <meta name="keywords" ... />)
    public function get_keywords(){
        return $this->match_first('/(<meta name="keywords" content="(.*)" \/>)/i', 2);
    }

    // get rel links in header of the site
    // returns array(full preg_match_all() result, number of matches)
    public function get_link_rel(){
        $patterns = array();
        preg_match_all('/(rel=)(".*") href=(".*")/im', (string) $this->DataFromSite, $patterns);
        $total = (isset($patterns[2]) && is_array($patterns[2])) ? count($patterns[2]) : 0;
        return array($patterns, $total);
    }

    // retrieve href targets ending in .css
    public function get_external_css(){
        return $this->match_all_counted('/(href=")(\w.*\.css)"/i', 2);
    }

    // retrieve all h1 tags
    public function get_h1(){
        return $this->tag_content('h1');
    }

    // retrieve all h2 tags
    public function get_h2(){
        return $this->tag_content('h2');
    }

    // retrieve all h3 tags
    public function get_h3(){
        return $this->tag_content('h3');
    }

    // retrieve all h4 tags
    public function get_h4(){
        return $this->tag_content('h4');
    }

    // retrieve all h5 tags
    public function get_h5(){
        return $this->tag_content('h5');
    }

    // retrieve all h6 tags
    public function get_h6(){
        return $this->tag_content('h6');
    }

    // retrieve p tag contents
    public function get_p(){
        return $this->tag_content('p');
    }

    // retrieve names of links (plain list, no count)
    public function get_a_content(){
        return $this->match_all("/(<a.*>)(\w.*)(<.*>)/ismU", 2);
    }

    // retrieve link destinations (plain list, no count)
    public function get_a_href(){
        return $this->match_all('/(href=")(.*?)(")/i', 2);
    }

    // get count of href's (integer)
    public function get_a_href_count(){
        return count($this->match_all('/<(a.*) href=\"(.*?)\"(.*)<\/a>/', 0));
    }

    // get whatever follows the href attribute inside a link tag (plain list, no count)
    public function get_a_additionaltags(){
        return $this->match_all('/<(a.*) href="(.*?)"(.*)>(.*)(<\/a>)/', 3);
    }

    // retrieve span's
    public function get_span(){
        return $this->match_all_counted('/(<span .*>)(.*)(<\/span>)/', 2);
    }

    // retrieve script bodies
    public function get_script(){
        return $this->match_all_counted('/(<script.*>)(.*)(<\/script>)/imxsU', 2);
    }

    // retrieve content of ul's
    // NOTE(review): with the x flag the blank in "<ul \w*>" is ignored, so only
    // attribute-less "<ul>" openings match - confirm whether that is intended.
    public function get_ul(){
        return $this->match_all_counted('/(<ul \w*>)(.*)(<\/ul>)/ismxU', 2);
    }

    // retrieve li contents
    // NOTE(review): same x-flag caveat as get_ul() - only attribute-less openings match.
    public function get_li(){
        return $this->match_all_counted('/(<li \w*>)(.*)(<\/li>)/ismxU', 2);
    }

    // retrieve page comments
    public function get_comments(){
        return $this->match_all_counted('/(<!--).(.*)(-->)/isU', 2);
    }

    // retrieve all used id's on the page
    public function get_ids(){
        return $this->match_all_counted('/(id="(\w*)")/is', 2);
    }

    // retrieve all used classes ( inline ) of the document
    public function get_classes(){
        return $this->match_all_counted('/(class="(\w*)")/is', 2);
    }

    // get the meta tag contents
    public function get_meta_content(){
        return $this->match_all_counted('/(<meta)(.*="(.*)").\/>/ix', 2);
    }

    // get inline styles
    public function get_styles(){
        return $this->match_all_counted('/(style=")(.*?)(")/is', 2);
    }

    // get titles of tags
    public function get_tag_titles(){
        return $this->match_all_counted('/(title=)"(.*)"(.*)/', 2);
    }

    // get image alt descriptions
    public function get_image_alt(){
        return $this->match_all_counted('/(alt=.)([a-zA-Z0-9\s]{1,})/', 2);
    }

    // retrieve images on the site
    public function get_images(){
        return $this->match_all_counted('/(<img)\s (src="([a-zA-Z0-9\.;:\/\?&=_|\r|\n]{1,})")/isxmU', 3);
    }

    // retrieve email address of the mailto tag if any
    public function get_mailto(){
        return $this->match_all_counted('/(<a\shref=")(mailto:)([a-zA-Z@0-9\.]{1,})"/ims', 3);
    }

    // retrieve any email
    public function get_emails(){
        return $this->match_all_counted('/[a-zA-Z0-9_-]{1,}@[a-zA-Z0-9-_]{1,}\.[a-zA-Z]{1,4}/', 0);
    }

    /**
     * Counts how often $word occurs in the page source.
     *
     * The word is treated as literal text (case-sensitive, non-overlapping);
     * regex metacharacters in it have no special meaning.
     *
     * @param string $word Text to look for.
     * @return int Number of occurrences; 0 for an empty $word.
     */
    public function countkeyword($word){
        $word = (string) $word;
        if ($word === '') {
            // substr_count() rejects an empty needle
            return 0;
        }
        return substr_count((string) $this->DataFromSite, $word);
    }

    /**
     * Returns the host of $this->url without a leading "www." and without its
     * last label (com/eu/de ...); remaining dots are removed.
     *
     * Example: "http://www.phazeddl.com/index.php" gives "phazeddl".
     *
     * @return string The bare name, or '' when the URL has no host part.
     */
    public function get_domain_name_only(){
        $host = parse_url((string) $this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return '';
        }
        $host = preg_replace('/^www\./i', '', $host);
        $lastDot = strrpos($host, '.');
        if ($lastDot !== false) {
            // drop the final label (the TLD)
            $host = substr($host, 0, $lastDot);
        }
        return str_replace('.', '', $host);
    }
}
?>
PHP Code:<?php
// Demo: download one page and dump everything ParseSite can extract from it.
// require_once stops right here if the class file is missing, instead of
// failing later on "new ParseSite".
require_once 'parse.php';
$Parse = new ParseSite("http://www.phazeddl.com");

// The extracted values are fragments of the remote page's HTML. Buffer the dump
// and escape it so the browser displays the markup instead of rendering (or
// executing) content that came from the scraped site.
ob_start();
var_dump(
    $Parse->get_doctype(),
    $Parse->get_doc_title(),
    $Parse->get_keywords(),
    $Parse->get_link_rel(),
    $Parse->get_external_css(),
    $Parse->get_h1(),
    $Parse->get_h2(),
    $Parse->get_h3(),
    $Parse->get_h4(),
    $Parse->get_h5(),
    $Parse->get_h6(),
    $Parse->get_p(),
    $Parse->get_a_content(),
    $Parse->get_a_href(),
    $Parse->get_a_href_count(),
    $Parse->get_a_additionaltags(),
    $Parse->get_span(),
    $Parse->get_script(),
    $Parse->get_ul(),
    $Parse->get_li(),
    $Parse->get_comments(),
    $Parse->get_ids(),
    $Parse->get_classes(),
    $Parse->get_meta_content(),
    $Parse->get_styles(),
    $Parse->get_tag_titles(),
    $Parse->get_image_alt(),
    $Parse->get_images(),
    $Parse->get_mailto(),
    $Parse->get_emails(),
    $Parse->countkeyword("warez"),
    $Parse->get_domain_name_only()
);
$dump = ob_get_clean();

echo "<pre>";
// ENT_SUBSTITUTE keeps the output non-empty when the page is not valid UTF-8
echo htmlspecialchars($dump, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
echo "</pre>";
?>
thanks
litewarez Reviewed by litewarez on . Ultimate PHP Scraper V1 (parse websites). V1 Heya all, Here's a scraper that I created that you can use to split a website's HTML source into different arrays. Currently supports: doctype Rating: 5
Join Litewarez.net today and become a part of the community.
Unique | Clean | Advanced (All with you in mind)
Downloads | Webmasters
Notifications,Forum,Chat,Community all at Litewarez Webmasters
-
15th May 2009, 05:50 AM #2Respected DeveloperWebsite's:
PlatinumW.org NexusDDL.com HD-United.org CheckLinks.org FLVD.org
Nice work, too bulky to be used for real scraping though.
Current projects:
Megaupload Premium Multifetch Script | FF Plugin: Tinypic and Imagevenue Image Remoter
Projects in hiatus:
IPB Linkchecker Bot | VB Linkchecker Bot
Sponsored Links
Thread Information
Users Browsing this Thread
There is currently 1 user browsing this thread. (0 members and 1 guest)
Similar Threads
-
how to parse url?
By Cometolearn in forum Web Development AreaReplies: 7Last Post: 29th Apr 2012, 09:58 AM -
[VPS] Ultimate Offshore Linux/Windows VPS [Host all kinds of websites]
By ScopeHosts.Sales in forum ArchiveReplies: 10Last Post: 30th Jan 2012, 01:58 PM -
No Parse Tag
By soft2050 in forum Feedback and SuggestionsReplies: 0Last Post: 12th Nov 2011, 08:26 AM -
Getting a parse error (OOP) PHP
By timtamboy63 in forum Web Development AreaReplies: 2Last Post: 23rd Jul 2010, 04:11 AM
themaPoster - post to forums and...
Version 5.19 released. Open older version (or...