Activity Stream
48,167 MEMBERS
61084 ONLINE
besthostingforums On YouTube Subscribe to our Newsletter besthostingforums On Twitter besthostingforums On Facebook besthostingforums On facebook groups

Results 1 to 2 of 2
  1.     
    #1
    Member
    Website's:
    litewarez.net litewarez.com triniwarez.com

    Default Ultimate PHP Scraper V1 (parse websites). V1

    Heya all,


    Here's a scraper that I created that you can use to split a website's HTML source into different arrays.

    Currently supports:


    • doctype
    • doc title
    • keywords
    • link rel
    • external css
    • h1
    • h2
    • h3
    • h4
    • h5
    • h6
    • p
    • a content
    • a href
    • a href count
    • a additionaltags
    • span
    • script
    • ul
    • li
    • comments
    • ids
    • classes
    • meta content
    • styles
    • tag titles
    • image alt
    • images
    • mailto
    • emails
    • count keywords

    HERE'S THE CLASS FILE

    Save this as parse.php
    PHP Code: 
    <?php
    /**
     * Downloads one web page with cURL and exposes regex-based extractors
     * for common pieces of its HTML (headings, links, meta tags, e-mails, ...).
     *
     * The page is fetched exactly once, in the constructor; every get_*()
     * method then runs a regular expression over the stored source.
     *
     * The expressions are simple heuristics, not an HTML parser: several of
     * them are greedy or match on attribute text alone, so results on
     * real-world markup are approximate.
     *
     * Unless stated otherwise, extractors return a two-element array:
     *   [0] => the extracted value(s), [1] => how many were found.
     */
    class ParseSite
    {
        /** @var string HTML source the extractors operate on. */
        public $DataFromSite = '';

        /** @var string URL handed to the constructor. */
        public $url = '';

        /** @var array cURL options used for the download. */
        public $CurlOP = array();

        /** @var mixed cURL handle of the download (false if cURL could not be initialised). */
        public $ch = null;

        /** @var string Body returned by the download ('' on failure). */
        public $Data = '';

        /**
         * Stores the URL and immediately downloads the page.
         *
         * @param string $url Absolute URL of the page to analyse.
         */
        public function __construct($url)
        {
            $this->url = $url;
            $this->DataFromSite = $this->grab_page();
        }

        /**
         * Downloads $this->url and returns the response body.
         *
         * @return string Response body, or '' when the transfer fails, so the
         *                extractors always receive a string.
         */
        private function grab_page()
        {
            $this->CurlOP = array(
                CURLOPT_RETURNTRANSFER => true,       // return web page
                CURLOPT_HEADER         => false,      // don't return headers
                CURLOPT_FOLLOWLOCATION => true,       // follow redirects
                CURLOPT_ENCODING       => "",         // handle all encodings
                CURLOPT_USERAGENT      => "LWS V1.0", // who am i
                CURLOPT_AUTOREFERER    => true,       // set referer on redirect
                CURLOPT_CONNECTTIMEOUT => 120,        // timeout on connect
                CURLOPT_TIMEOUT        => 120,        // timeout on response
                CURLOPT_MAXREDIRS      => 10,         // stop after 10 redirects
                // NOTE(review): certificate and host verification are switched
                // off, so HTTPS responses are not authenticated. Kept as in the
                // original; enable both if the scraped content must be trusted.
                CURLOPT_SSL_VERIFYHOST => 0,
                CURLOPT_SSL_VERIFYPEER => false,
            );

            $this->ch = curl_init($this->url);
            if ($this->ch === false) {
                $this->Data = '';
                return $this->Data;
            }

            curl_setopt_array($this->ch, $this->CurlOP);
            $body = curl_exec($this->ch);
            curl_close($this->ch);

            // curl_exec() yields false on failure; normalise to a string.
            $this->Data = is_string($body) ? $body : '';
            return $this->Data;
        }

        /**
         * Runs preg_match() on the page and returns one capture group.
         *
         * @param string $pattern PCRE pattern.
         * @param int    $group   Capture group to return (0 = whole match).
         * @return array [matched text, 1] on a match, [null, 0] otherwise.
         */
        private function first_match($pattern, $group)
        {
            $found = array();
            $hit = preg_match($pattern, (string) $this->DataFromSite, $found);
            if ($hit === 1 && isset($found[$group])) {
                return array($found[$group], 1);
            }
            return array(null, 0);
        }

        /**
         * Runs preg_match_all() on the page and returns the raw match array.
         *
         * @param string $pattern PCRE pattern.
         * @return array Matches in preg_match_all()'s default pattern order;
         *               empty array when the regex engine reports an error.
         */
        private function all_matches($pattern)
        {
            $found = array();
            if (preg_match_all($pattern, (string) $this->DataFromSite, $found) === false) {
                return array();
            }
            return $found;
        }

        /**
         * Returns every value captured by one group of a pattern.
         *
         * @param string $pattern PCRE pattern.
         * @param int    $group   Capture group to collect.
         * @return array List of captured strings (possibly empty).
         */
        private function group_values($pattern, $group)
        {
            $found = $this->all_matches($pattern);
            return isset($found[$group]) ? $found[$group] : array();
        }

        /**
         * Returns the values of one capture group together with their count.
         *
         * @param string $pattern PCRE pattern.
         * @param int    $group   Capture group to collect.
         * @return array [list of captured strings, number of captures].
         */
        private function group_with_count($pattern, $group)
        {
            $values = $this->group_values($pattern, $group);
            return array($values, count($values));
        }

        /**
         * Retrieves the DOCTYPE declaration (only those ending in `dtd">`).
         *
         * @return array [declaration or null, 1 or 0].
         */
        public function get_doctype()
        {
            return $this->first_match('/<!DOCTYPE (\w.*)dtd">/is', 0);
        }

        /**
         * Retrieves the <title> element, tags included.
         *
         * @return array [title element or null, 1 or 0].
         */
        public function get_doc_title()
        {
            return $this->first_match('/<title> ?.* <\/title>/isx', 0);
        }

        /**
         * Retrieves the content of the keywords meta tag.
         *
         * @return array [keywords string or null, 1 or 0].
         */
        public function get_keywords()
        {
            return $this->first_match('/(<meta name="keywords" content="(.*)" \/>)/i', 2);
        }

        /**
         * Retrieves rel/href attribute pairs (e.g. <link rel="..." href="...">).
         *
         * @return array [full preg_match_all() result, number of matches].
         */
        public function get_link_rel()
        {
            $found = $this->all_matches('/(rel=)(".*") href=(".*")/im');
            $rels = isset($found[2]) ? $found[2] : array();
            return array($found, count($rels));
        }

        /**
         * Retrieves href values that end in ".css".
         *
         * @return array [list of stylesheet paths, count].
         */
        public function get_external_css()
        {
            return $this->group_with_count('/(href=")(\w.*\.css)"/i', 2);
        }

        /**
         * Retrieves the contents of all h1 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h1()
        {
            return $this->group_with_count('/(<h1.*>)(\w.*)(<\/h1>)/isxmU', 2);
        }

        /**
         * Retrieves the contents of all h2 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h2()
        {
            return $this->group_with_count('/(<h2.*>)(\w.*)(<\/h2>)/isxmU', 2);
        }

        /**
         * Retrieves the contents of all h3 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h3()
        {
            return $this->group_with_count('/(<h3.*>)(\w.*)(<\/h3>)/ismU', 2);
        }

        /**
         * Retrieves the contents of all h4 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h4()
        {
            return $this->group_with_count('/(<h4.*>)(\w.*)(<\/h4>)/ismU', 2);
        }

        /**
         * Retrieves the contents of all h5 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h5()
        {
            return $this->group_with_count('/(<h5.*>)(\w.*)(<\/h5>)/ismU', 2);
        }

        /**
         * Retrieves the contents of all h6 tags.
         *
         * @return array [list of contents, count].
         */
        public function get_h6()
        {
            return $this->group_with_count('/(<h6.*>)(\w.*)(<\/h6>)/ismU', 2);
        }

        /**
         * Retrieves the contents of p tags.
         *
         * The opening part is matched as `<p.*>`, so other tags whose name
         * starts with "p" can open a match as well.
         *
         * @return array [list of contents, count].
         */
        public function get_p()
        {
            return $this->group_with_count('/(<p.*>)(\w.*)(<\/p>)/ismU', 2);
        }

        /**
         * Retrieves the text of links.
         *
         * @return array List of link texts (no count element).
         */
        public function get_a_content()
        {
            return $this->group_values('/(<a.*>)(\w.*)(<.*>)/ismU', 2);
        }

        /**
         * Retrieves link destinations (every href="..." attribute value).
         *
         * @return array List of href values (no count element).
         */
        public function get_a_href()
        {
            return $this->group_values('/(href=")(.*?)(")/i', 2);
        }

        /**
         * Counts anchors that carry an href attribute.
         *
         * @return int Number of matches.
         */
        public function get_a_href_count()
        {
            return count($this->group_values('/<(a.*) href=\"(.*?)\"(.*)<\/a>/', 0));
        }

        /**
         * Retrieves whatever follows the href attribute inside an anchor's
         * opening tag (additional attributes).
         *
         * @return array List of attribute strings (no count element).
         */
        public function get_a_additionaltags()
        {
            return $this->group_values('/<(a.*) href="(.*?)"(.*)>(.*)(<\/a>)/', 3);
        }

        /**
         * Retrieves the contents of span tags that have at least one attribute.
         *
         * @return array [list of contents, count].
         */
        public function get_span()
        {
            return $this->group_with_count('/(<span .*>)(.*)(<\/span>)/', 2);
        }

        /**
         * Retrieves the contents of script tags.
         *
         * @return array [list of script bodies, count].
         */
        public function get_script()
        {
            return $this->group_with_count('/(<script.*>)(.*)(<\/script>)/imxsU', 2);
        }

        /**
         * Retrieves the contents of ul tags.
         *
         * @return array [list of contents, count].
         */
        public function get_ul()
        {
            return $this->group_with_count('/(<ul \w*>)(.*)(<\/ul>)/ismxU', 2);
        }

        /**
         * Retrieves the contents of li tags.
         *
         * @return array [list of contents, count].
         */
        public function get_li()
        {
            return $this->group_with_count('/(<li \w*>)(.*)(<\/li>)/ismxU', 2);
        }

        /**
         * Retrieves HTML comments.
         *
         * The character directly after `<!--` is consumed by the pattern and
         * is not part of the returned text.
         *
         * @return array [list of comment texts, count].
         */
        public function get_comments()
        {
            return $this->group_with_count('/(<!--).(.*)(-->)/isU', 2);
        }

        /**
         * Retrieves all id attribute values made of word characters.
         *
         * @return array [list of ids, count].
         */
        public function get_ids()
        {
            return $this->group_with_count('/(id="(\w*)")/is', 2);
        }

        /**
         * Retrieves class attribute values made of word characters only
         * (multi-class attributes such as class="a b" do not match).
         *
         * @return array [list of class names, count].
         */
        public function get_classes()
        {
            return $this->group_with_count('/(class="(\w*)")/is', 2);
        }

        /**
         * Retrieves the attribute text of self-closing meta tags.
         *
         * @return array [list of attribute strings, count].
         */
        public function get_meta_content()
        {
            return $this->group_with_count('/(<meta)(.*="(.*)").\/>/ix', 2);
        }

        /**
         * Retrieves inline style attribute values.
         *
         * @return array [list of style strings, count].
         */
        public function get_styles()
        {
            return $this->group_with_count('/(style=")(.*?)(")/is', 2);
        }

        /**
         * Retrieves title attribute values.
         *
         * @return array [list of titles, count].
         */
        public function get_tag_titles()
        {
            return $this->group_with_count('/(title=)"(.*)"(.*)/', 2);
        }

        /**
         * Retrieves alt texts (letters, digits and whitespace only).
         *
         * @return array [list of alt texts, count].
         */
        public function get_image_alt()
        {
            return $this->group_with_count('/(alt=.)([a-zA-Z0-9\s]{1,})/', 2);
        }

        /**
         * Retrieves the src of img tags whose src is the first attribute.
         *
         * @return array [list of image URLs, count].
         */
        public function get_images()
        {
            return $this->group_with_count(
                '/(<img)\s (src="([a-zA-Z0-9\.;:\/\?&=_|\r|\n]{1,})")/isxmU',
                3
            );
        }

        /**
         * Retrieves e-mail addresses from <a href="mailto:..."> links.
         *
         * @return array [list of addresses, count].
         */
        public function get_mailto()
        {
            return $this->group_with_count('/(<a\shref=")(mailto:)([a-zA-Z@0-9\.]{1,})"/ims', 3);
        }

        /**
         * Retrieves anything on the page that looks like an e-mail address.
         *
         * @return array [list of addresses, count].
         */
        public function get_emails()
        {
            return $this->group_with_count(
                '/[a-zA-Z0-9_-]{1,}@[a-zA-Z0-9-_]{1,}\.[a-zA-Z]{1,4}/',
                0
            );
        }

        /**
         * Counts how often a keyword occurs on the page.
         *
         * The comparison is case-sensitive and $word is taken literally
         * (it is not interpreted as a regular expression).
         *
         * @param string $word Keyword to look for.
         * @return int Number of non-overlapping occurrences; 0 for an empty keyword.
         */
        public function countkeyword($word)
        {
            $word = (string) $word;
            if ($word === '') {
                // substr_count() rejects an empty needle.
                return 0;
            }
            return substr_count((string) $this->DataFromSite, $word);
        }

        /**
         * Retrieves just the site name from the URL, without a leading "www."
         * and without the final label (com/eu/de etc.).
         *
         * Only the host part of the URL is considered, so paths and query
         * strings do not leak into the result. Any dots left between the
         * remaining labels are removed. A single-label host (e.g. "localhost")
         * is returned unchanged.
         *
         * @return string Site name, or '' when the URL has no host part.
         */
        public function get_domain_name_only()
        {
            $host = parse_url((string) $this->url, PHP_URL_HOST);
            if (!is_string($host) || $host === '') {
                return '';
            }

            if (stripos($host, 'www.') === 0) {
                $host = substr($host, 4);
            }

            $labels = explode('.', $host);
            if (count($labels) > 1) {
                array_pop($labels); // drop the top-level domain
            }
            return implode('', $labels);
        }
    }

    ?>
    EXAMPLE CODE:

    PHP Code: 
    <?php 

    // Pull in the ParseSite class definition.
    include 'parse.php';

    // Constructing the object downloads the page.
    $Parse = new ParseSite("http://www.phazeddl.com");

    echo "<pre>";

    // Run every extractor, keeping the results in the order they are printed.
    $report = array(
        $Parse->get_doctype(),
        $Parse->get_doc_title(),
        $Parse->get_keywords(),
        $Parse->get_link_rel(),
        $Parse->get_external_css(),
        $Parse->get_h1(),
        $Parse->get_h2(),
        $Parse->get_h3(),
        $Parse->get_h4(),
        $Parse->get_h5(),
        $Parse->get_h6(),
        $Parse->get_p(),
        $Parse->get_a_content(),
        $Parse->get_a_href(),
        $Parse->get_a_href_count(),
        $Parse->get_a_additionaltags(),
        $Parse->get_span(),
        $Parse->get_script(),
        $Parse->get_ul(),
        $Parse->get_li(),
        $Parse->get_comments(),
        $Parse->get_ids(),
        $Parse->get_classes(),
        $Parse->get_meta_content(),
        $Parse->get_styles(),
        $Parse->get_tag_titles(),
        $Parse->get_image_alt(),
        $Parse->get_images(),
        $Parse->get_mailto(),
        $Parse->get_emails(),
        $Parse->countkeyword("warez"),
        $Parse->get_domain_name_only()
    );

    // Dump each result in turn.
    foreach ($report as $section) {
        var_dump($section);
    }

    echo "</pre>";
    ?>
    Hope you enjoy it, and please comment and report any bugs.

    thanks
    litewarez Reviewed by litewarez on . Ultimate PHP Scraper V1 (parse websites). V1 Heya all, Heres a scraper that i created that you can use to split websites html source into different arrays Currently supports: doctype Rating: 5
    Join Litewarez.net today and become a part of the community.
    Unique | Clean | Advanced (All with you in mind)
    Downloads | Webmasters


    Notifications,Forum,Chat,Community all at Litewarez Webmasters


  2.   Sponsored Links

  3.     
    #2
    Respected Developer
    Website's:
    PlatinumW.org NexusDDL.com HD-United.org CheckLinks.org FLVD.org
    Nice work, though it's too bulky to be used for real scraping.

Thread Information

Users Browsing this Thread

There is currently 1 user browsing this thread. (0 members and 1 guest)

Similar Threads

  1. how to parse url?
    By Cometolearn in forum Web Development Area
    Replies: 7
    Last Post: 29th Apr 2012, 09:58 AM
  2. [VPS] Ultimate Offshore Linux/Windows VPS [Host all kinds of websites]
    By ScopeHosts.Sales in forum Archive
    Replies: 10
    Last Post: 30th Jan 2012, 01:58 PM
  3. No Parse Tag
    By soft2050 in forum Feedback and Suggestions
    Replies: 0
    Last Post: 12th Nov 2011, 08:26 AM
  4. Getting a parse error (OOP) PHP
    By timtamboy63 in forum Web Development Area
    Replies: 2
    Last Post: 23rd Jul 2010, 04:11 AM

Tags for this Thread

BE SOCIAL