0

Web サイトの DOM 全体を保存し、css 計算スタイルの取得などの操作を行う必要がある小さなプロジェクトがあります (これは jquery で行います)。

現時点では、cURL 経由で HTML マークアップを取得する関数をいくつか作成しています。このDOMを多次元配列に保存する方法はありますか? 再帰的に?

現在は https://code.google.com/p/php-html2array/ (バージョン 1.01) を使用していますが、何らかの理由で、メインの「body」タグなどの重要な要素が結果から欠落してしまいます。テストに使用している URL は次のとおりです: http://www.vulytrampolines.com/

これを実現する方法、あるいはその Google Code の PHP コードを修正して body タグが取得されるようにする方法を教えてください。

これまでの私のコード(Pastebin):

<?php
/**
* Website Layout Checker
*
* @package  
* @author Marais Rossouw (marais.r@vulytrampolines.com)
* @copyright Vuly
* @version 2013
* @access public
*/

require_once '../setup.php';
ini_set('max_execution_time', 6000);

/**
 * Downloads a web page, converts its markup into a nested array with
 * htmlParser and stores both the raw HTML and a JSON dump of that array on
 * disk.
 *
 * Every step reports to an in-memory console (see consoleLog() and
 * renderConsole()) instead of aborting, so the surrounding page can always be
 * rendered, even when the download or the parsing failed.
 */
class layout {

        private $_LAYOUT, $_URL, $_DOC, $_LAYOUT_ARRAY, $_SAVE_TO_JSON, $_SAVE_TO_HTML, $_HTML_BODY;

        // Used by _INIT() to build the target paths. Declared explicitly so
        // they are no longer created as undeclared (dynamic) properties.
        private $_FILE_NAME, $_FILE_PATH;

        // Chronological list of log messages, rendered by renderConsole().
        private $_CONSOLE = array();

        /**
         * Fetches $url, parses it and writes the JSON and HTML files.
         *
         * @param string $url Absolute URL of the page to inspect.
         */
        public function __construct($url) {

                // Start from an empty document so that a failed download does
                // not hand null to the DOM and parser code further down.
                $this->_URL = $url;
                $this->_LAYOUT = '';

                // Gets the contents of the page specified.
                try {
                        $client = new Zend_Http_Client;
                        $client->setUri($url);
                        $client->setConfig(array('strictredirects' => true, 'maxredirects' => 10, 'timeout' => 8));
                        $response = $client->request();

                        $this->_LAYOUT = (string) $response->getBody();
                } catch (Exception $e) {
                        $this->consoleLog($e);
                }

                // Creates the DOMDocument and the DOM array
                try {
                        $this->_INIT();
                } catch (Exception $e) {
                        $this->consoleLog($e);
                }

                // Save the files. json_encode() returns false for input it
                // cannot encode (for example invalid UTF-8); fall back to an
                // empty list so the stored file is still valid JSON.
                $json = json_encode($this->_LAYOUT_ARRAY);
                if ($json === false) {
                        $this->consoleLog("The dom array could not be encoded as JSON (json_last_error: " . json_last_error() . ")");
                        $json = '[]';
                }

                $this->writeFile($this->_SAVE_TO_JSON, $json, "JSON");
                $this->writeFile($this->_SAVE_TO_HTML, $this->_LAYOUT, "HTML");
        }

        /**
         * Builds the DOMXpath helper and the DOM array, and chooses the names
         * of the files the results are written to.
         */
        private function _INIT() {
                $doc = new DOMDocument();

                // Real-world markup is rarely valid: collect libxml warnings
                // internally while loading, then drop them and restore the
                // setting that was active before.
                $previous = libxml_use_internal_errors(true);
                if ($this->_LAYOUT !== '') {
                        $doc->loadHTML($this->_LAYOUT);
                }
                libxml_clear_errors();
                libxml_use_internal_errors($previous);

                $this->_DOC = new DOMXpath($doc);

                $this->consoleLog("DOMDocument created");

                $parser = new htmlParser($this->_LAYOUT);
                $this->_LAYOUT_ARRAY = $parser->toArray();

                $this->consoleLog("Dom array created");

                $this->consoleLog("There are " . count($this->_LAYOUT_ARRAY, COUNT_RECURSIVE) . " elements in the dom array");

                $this->_FILE_NAME = "VULY_LAYOUT_CHECKER-" . sha1(htmlspecialchars(trim($this->_URL)) . date("Ymd") . rand(99, 9999));
                // A forward slash works as a directory separator on every
                // platform PHP runs on and stays valid when the path is
                // appended to a URL by the caller.
                $this->_FILE_PATH = "layout_checker/";

                $this->_SAVE_TO_JSON = $this->_FILE_PATH . $this->_FILE_NAME . ".txt";
                $this->_SAVE_TO_HTML = $this->_FILE_PATH . $this->_FILE_NAME . ".html";
        }

        /**
         * Writes $contents to $path and logs whether that worked.
         *
         * file_put_contents() reports failure through its return value, not
         * through an exception, so the result is checked explicitly.
         *
         * @param string|null $path     Target file; null when _INIT() failed.
         * @param string      $contents Data to store.
         * @param string      $label    Short name used in the log message.
         */
        private function writeFile($path, $contents, $label) {
                if (!is_string($path) || $path === '') {
                        $this->consoleLog("The " . $label . " file was not saved: no target path is available");
                        return;
                }

                if (file_put_contents($path, $contents) === false) {
                        $this->consoleLog("The " . $label . " file could not be saved to: " . $path);
                        return;
                }

                $this->consoleLog("The " . $label . " file was saved to: " . $path);
        }

        /**
         * @return string The raw markup that was downloaded ('' on failure).
         */
        public function toString() {
                return $this->_LAYOUT;
        }

        /**
         * @return string|null innerHTML of the first body element found in
         *                     the DOM array, or null when there is none.
         */
        public function getBody() {
                $this->recurse($this->_LAYOUT_ARRAY);
                return $this->_HTML_BODY;
        }

        /**
         * Depth-first search for the body node; stores its innerHTML in
         * $_HTML_BODY.
         *
         * @param array $file List of nodes as produced by htmlParser::toArray().
         */
        private function recurse($file) {
                if ($this->_HTML_BODY !== null || !is_array($file)) { return; }

                foreach ($file as $node) {
                        // Test the tag before looking at the children, so a
                        // body without child elements is found as well. Tag
                        // names are compared case-insensitively.
                        if (isset($node['tag']) && strcasecmp($node['tag'], "body") === 0) {
                                $this->_HTML_BODY = isset($node['innerHTML']) ? $node['innerHTML'] : '';
                                return;
                        }

                        if (!empty($node['childNodes'])) {
                                $this->recurse($node['childNodes']);
                                if ($this->_HTML_BODY !== null) { return; }
                        }
                }
        }

        /**
         * @return string|false Contents of the saved JSON file, or false when
         *                      no such file exists.
         */
        public function getJSON_FILE() {
                if (!is_string($this->_SAVE_TO_JSON) || !is_file($this->_SAVE_TO_JSON)) {
                        return false;
                }
                return file_get_contents($this->_SAVE_TO_JSON);
        }

        /**
         * @return string|null Relative path of the saved HTML copy.
         */
        public function get_SAVE_TO_HTML() {
                return $this->_SAVE_TO_HTML;
        }


        /**
         * Appends a message to the console.
         *
         * @param mixed $string Message; exceptions are stored as their string
         *                      representation.
         */
        public function consoleLog($string) {
                $this->_CONSOLE[] = (string) $string;
        }

        /**
         * @return string All console messages, one per line, each prefixed
         *                with its padded index ("0: message").
         */
        public function renderConsole() {
                $return = "";

                // Width of the largest index plus room for ":" and a space.
                $_PAD_SIZE = strlen((string) count($this->_CONSOLE)) + 2;

                foreach ($this->_CONSOLE as $key => $value) {
                        $return .= str_pad($key . ":", $_PAD_SIZE) . $value . "\n";
                }
                return $return;
        }

}

/**
 * Regex-based HTML to array converter.
 *
 * The markup is reduced step by step: every element that has been recognised
 * is cut out of the working copy and replaced by a placeholder of the form
 * <separator><level>-<index><separator> (for example "~2-5~"). The element
 * data is kept in $levelArray[level][index]. Once nothing is left to replace,
 * getArray() follows the placeholders to build the nested result.
 *
 * NOTE(review): placeholders are located by splitting on the separator, so a
 * separator character that already occurs in the input markup looks like it
 * would be mistaken for a placeholder boundary - confirm.
 */
class htmlParser {

        // Character that delimits a placeholder.
        // Must not be a character such as < or >.
        private $separator = '~';
        // Regex alternation of tags that have no innerHTML; matching tags are
        // removed from the markup (see replaceSingleTags()).
        // Extend the list if a tag is missing.
        private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

        //-- Don't edit below this --

        // $html:  working copy of the markup, rewritten during parsing.
        // $level: index of the most recent replacement pass; -1 = none yet.
        private $html,$level;
        // Element records, indexed as [level][index within that pass].
        public $levelArray;

        /**
         * @param string $html Markup to parse; it is stored with whitespace
         *                     removed by removeWhiteSpace().
         */
        function __construct($html='') {
                $this->html=$this->removeWhiteSpace($html);
                $this->level=-1;
                $this->levelArray=array();
        }
        function __destruct() {
                //nothing yet;
        }
        /**
         * Resolves a placeholder such as "~2-5~" to its record in $levelArray.
         */
        private function getElement($value) {
                $ar = explode($this->separator,$value);
                $ar = explode('-',$ar[1]);
                return $this->levelArray[$ar[0]][$ar[1]];
        }
        /**
         * Expands the placeholders contained in $str back into markup.
         *
         * For $level 0 a placeholder is replaced by the record's htmlText
         * alone; for every other level the element is rebuilt as
         * <tag attr>htmlText</tag>.
         */
        private function parseToHTML($str,$level) {
                $ar=$this->getArrayOfReplacements($str);
                foreach ($ar as $item) {
                        $elem = $this->getElement($item);
                        $str=str_replace($item,($level==0?$elem['htmlText']:'<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>'),$str);
                }
                return $str;
        }
        /**
         * Deletes every tag listed in $singleTags from the working markup.
         * These tags are not recorded, so they do not appear in the result.
         */
        private function replaceSingleTags() {
                //tags like img, input etc
                $result=preg_match_all('/<('.$this->singleTags.')(.[^><]*)?>/is', $this->html, $m);
                if ($result>0) {
                        foreach ($m[0] as $id => $value) {
                                $this->html = str_replace($value,'',$this->html);
                        }
                }
        }
        /**
         * Runs one replacement pass over elements whose content holds no "<"
         * (plain text and/or placeholders from earlier passes).
         *
         * Regex groups: 1 = tag name, 2 = attribute string, 3 = content. The
         * closing tag has to repeat group 1.
         */
        private function replaceSimpleTags() {
                //tags that only have text in them (no other content)
                $result=preg_match_all('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', $this->html, $m);
                if ($result>0) {
                        // Each pass that matched something opens a new level.
                        $this->level++;
                        $oneLevel=array();
                        foreach ($m[0] as $id => $value) {
                                // In the first pass htmlText is the complete matched
                                // element; later passes expand the placeholders
                                // found in the content instead.
                                if ($this->level==0) $htmlText=$value;
                                else $htmlText=$this->parseToHTML($m[3][$id],$this->level-1);

                                $oneLevel []= array('str' => $value, 'rep' => $this->separator.$this->level.'-'.$id.$this->separator, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);

                                // NOTE(review): str_replace() swaps every occurrence of
                                // $value, so identical elements all receive the
                                // placeholder of the first match - confirm intended.
                                $this->html = str_replace($value,$this->separator.$this->level.'-'.$id.$this->separator,$this->html);
                        }
                        $this->levelArray [$this->level] = $oneLevel;
                }
        }
        /**
         * Final pass: same bookkeeping as replaceSimpleTags(), but the content
         * group is a greedy ".*", so it also matches elements that still
         * contain markup. In the first pass htmlText is the content only.
         */
        private function replaceRemainingTags() {
                //tags that remain after everything
                $result=preg_match_all('/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is', $this->html, $m);
                if ($result>0) {
                        $this->level++;
                        $oneLevel=array();
                        foreach ($m[0] as $id => $value) {
                                if ($this->level==0) $htmlText=$m[3][$id];
                                else $htmlText=$this->parseToHTML($m[3][$id],$this->level-1);

                                $oneLevel []= array('str' => $value, 'rep' => $this->separator.$this->level.'-'.$id.$this->separator, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);

                                $this->html = str_replace($value,$this->separator.$this->level.'-'.$id.$this->separator,$this->html);
                        }
                        $this->levelArray [$this->level] = $oneLevel;
                }
        }
        /**
         * @return bool True while the pattern of replaceSimpleTags() still
         *              matches the working markup.
         */
        private function existSimpleTags() {
                $result=preg_match('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', $this->html);
                return $result>0;
        }
        /**
         * @return bool True while a tag from $singleTags is still present.
         */
        private function existSingleTags() {
                $result=preg_match('/<('.$this->singleTags.')(.[^><]*)?>/is', $this->html);
                return $result>0;
        }
        /**
         * Removes line breaks, tabs and the literal text "&nbsp;" and collapses
         * runs of two or more spaces into a single space.
         */
        private function removeWhiteSpace ($string) {
                $string = str_replace(array("\n","\r",'&nbsp;',"\t"),'',$string);
                return preg_replace('|  +|', ' ', $string);
        }
        /**
         * Parses the markup and returns it as a nested array.
         *
         * @param string $html Optional markup that replaces the one passed to
         *                     the constructor.
         * @return array List of nodes in the shape built by getArray().
         */
        public function toArray($html='') {

                //first part: coding
                if ($html!='') {
                        $this->html = $this->removeWhiteSpace($html);
                }
                // Keep replacing until neither pattern matches any more.
                while ($this->existSimpleTags() || $this->existSingleTags()) {
                        $this->replaceSingleTags();
                        $this->replaceSimpleTags();
                }
                $this->replaceRemainingTags();

                //now decoding
                $ar=$this->getArray($this->html);

                return $ar;
        }
        /**
         * Lists the placeholders found in $str.
         *
         * After splitting on the separator, every odd-numbered piece is
         * treated as a "<level>-<index>" key and wrapped in separators again.
         */
        private function getArrayOfReplacements($str) {
                $final=array();
                $ar=explode($this->separator,$str);
                for ($i=0;$i<(count($ar)-1)/2;$i++) {
                        $final []= $this->separator.$ar[$i*2+1].$this->separator;
                }
                return $final;
        }
        /**
         * @return bool False when the first non-blank character of $str is
         *              "<" or ">", true otherwise. Not called in this class.
         */
        private function startsWithText($str) {
                $first=substr(trim(str_replace(array("\n","\r"),'',$str)),0,1);
                if ($first=='<' || $first=='>') return false;
                return true;
        }
        /**
         * @return bool True when $str contains at least one of the strings in
         *              $array. Not called in this class.
         */
        private function strInArray($array,$str) {
                foreach ($array as $item) {
                        if (strpos($str,$item)!==false)
                                return true;
                }
                return false;
        }
        /**
         * Turns the placeholders in $html into nodes, recursing into the
         * content of each referenced element to build 'childNodes'.
         *
         * @param string $html   Text that may contain placeholders.
         * @param string $father Placeholder of the parent element; '' at the
         *                       top level.
         * @return array List of nodes; empty when $html has no separator.
         */
        private function getArray($html, $father='') {
                $final=array();
                if (strpos($html,$this->separator)!==false) {
                        $r=$this->getArrayOfReplacements($html);
                        foreach ($r as $i) {

                                $ar = explode($this->separator,$i);
                                $ar = explode('-',$ar[1]);
                                $elem = $this->levelArray[$ar[0]][$ar[1]];
                                // Remember the parent on the record so that
                                // loadNode() can report it later.
                                $this->levelArray[$ar[0]][$ar[1]]['father'] = $father;

                                $final []= array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $father, 'childNodes' => $this->getArray($elem['text'],$i));
                        }
                }
                return $final;
        }
        /**
         * Returns a single node (without 'childNodes') for a placeholder.
         *
         * The record's 'father' key is only written by getArray(), so this is
         * meant to be called after toArray().
         */
        public function loadNode($rep) {
                $elem = $this->getElement($rep);
                return array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $elem['father']);
        }
}

// Request handling: prepares the four values the template below prints.
//
// The "layout" parameter comes straight from the client, so it is only used
// when it is a plain string (not an array such as ?layout[]=x) that starts
// with http:// or https://. Anything else is rejected with a console message
// instead of being passed to the HTTP client.
$requested_url = (isset($_REQUEST['layout']) && is_string($_REQUEST['layout'])) ? trim($_REQUEST['layout']) : "";

if ($requested_url !== "" && preg_match('#^https?://#i', $requested_url) === 1) {
        $layout = new layout($requested_url);
        $console = $layout->renderConsole();
        $json_file = $layout->getJSON_FILE();
        $toString = $layout->toString();
        // Public URL of the saved HTML copy, shown in the preview iframe.
        $getBody = "http://" . $_SERVER['SERVER_NAME']."/etramp/scripts/" . $layout->get_SAVE_TO_HTML();
} else {
        // Nothing was requested, or the value was not an http(s) URL.
        $console = (isset($_REQUEST['layout']) && $requested_url !== "http://") ? "Rejected: only http:// and https:// URLs are supported\n" : "";
        $json_file = "";
        $toString = "";
        $getBody = "";
}

?>

<html>
<head>
        <title>Vuly Layout Checker</title>
        <style type="text/css">
        /* Page chrome for the layout checker. */
        html {
                height: 100%;
                margin:0;padding:0;
        }
        /* Solid colour first as a fallback, then vendor-prefixed gradients. */
        body {
                background: #728eaa;
                background: -moz-linear-gradient(top, #25303C 0%, #728EAA 100%);
                background: -webkit-gradient(linear, left top, left bottom, color-stop(0%, #25303C), color-stop(100%, #728EAA));
                font-family: sans-serif;
        }
        input, select {
                padding:10px;
        }
        select, input[type='submit'] {
                cursor:pointer;
        }
        label {
                color: #fff;
                padding-right: 10px;
        }
        /* Fixed-width URL form, centred horizontally. */
        form {
                margin: 50px auto 0 auto;
                width: 684px;
        }
        /* Upper row of panes (the two textareas with class "text1"). */
        .text1 {
                width:49%; height:220px; resize: none; position:fixed; top:150px;
        }
        /* Lower row of panes (the textarea and iframe with class "text2"). */
        .text2 {
                width:49%; resize: none; position:fixed; top: 380px; bottom:10px; height: 58%;
        }
        </style>
        <script src="//ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js"></script>
        <script type="text/javascript">
        var file;
        $( document ).ready(function() {
        file = <?php echo $json_file; ?>;
                recurse(file);
        });
        /**
         * Walks the DOM array depth-first, logging every tag name, and
         * reports the body node once it is reached.
         *
         * @param {Array} file List of nodes ({tag, innerHTML, childNodes, ...}).
         */
        function recurse(file) {
                if (!file) { return; }

                for (var i = 0; i < file.length; i++) {
                        // Log inside the loop: before it, "i" is still undefined
                        // and reading file[i].tag throws a TypeError.
                        console.log(file[i].tag);
                        if (file[i].childNodes) {
                                if (file[i].tag == "body") {
                                        // The node property is "innerHTML".
                                        console.log($(file[i].innerHTML, $('#NEW_LAYOUT').contents()));
                                        alert(file[i].tag);
                                } else {
                                        recurse(file[i].childNodes);
                                }
                        }
                }
        }
        </script>
</head>
<body>

        <?php
        // Everything printed below originates from the request or from the
        // downloaded page, so it is HTML-escaped before output. ENT_SUBSTITUTE
        // keeps pages that are not valid UTF-8 from being turned into an
        // empty string by htmlspecialchars().
        $layout_field = (isset($_REQUEST['layout']) && is_string($_REQUEST['layout'])) ? $_REQUEST['layout'] : "http://";
        ?>
        <textarea class="text1" style="left:10px;"><?php echo htmlspecialchars((string) $console, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>
        <textarea class="text1" style="right:10px;"><?php echo htmlspecialchars((string) $json_file, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>

        <form>
                <label for="layout">Website URL:</label>
                <input type="text" name="layout" id="layout" style="width: 500px" value="<?php echo htmlspecialchars($layout_field, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
                <input type="submit">
        </form>

        <textarea class="text2" style="left:10px;"><?php echo htmlspecialchars((string) $toString, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>
        <iframe id="NEW_LAYOUT" class="text2" style="right:10px;" src="<?php echo htmlspecialchars((string) $getBody, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>"></iframe>

</body>
</html>
4

1 に答える 1

0

phpQuery https://code.google.com/p/phpquery/のようなものをお勧めします

したがって、その URL を cURL し、その結果を次のように phpQuery に渡します。

phpQuery::selectDocument($doc);

例として、HTML を繰り返し処理する方法を次に示します (最後に選択した DOM に含まれるすべての LI 要素が対象です)。

// Loop over every element that pq('li') returns.
foreach(pq('li') as $li) {
        // iteration returns PLAIN dom nodes, NOT phpQuery objects,
        // so only plain node properties such as these can be read directly
        $tagName = $li->tagName;
        $childNodes = $li->childNodes;
        // so you NEED to wrap it within phpQuery, using pq(),
        // before calling phpQuery methods such as addClass()
        pq($li)->addClass('my-second-new-class');
}
回答日時: 2013-06-20T02:56:24.643