0

DOMに関連する奇妙なバグがあります。ドキュメント内のすべてのhrefを繰り返し処理し、必要に応じて絶対パスに置き換えようとしています。問題は、setAttribute()を使用した直後は、getAttribute()が変更後の値を返すことです。それにもかかわらず、saveHTML()で出力したり、getElementsByTagNameとgetAttributeを使用してタグを再度クエリしたりすると、値はhttp://example.com/path.php?cccからhttp://example.comに切り捨てられます。

これが私のコードです:

<?php
//include 'url_to_absolute.php';


/**
 * Turn an href taken from a page into an absolute URL based on $url.
 *
 * This is a deliberately small helper, not a full RFC 3986 resolver: relative
 * paths are joined directly onto $url, and dot-segments ("../") are not
 * collapsed.
 *
 * @param string $url  Base URL, e.g. 'http://example.com'.
 * @param string $href Attribute value to convert.
 * @return string The absolute URL; $href itself when it is already absolute.
 */
function url_to_absolute($url, $href) {
    $url  = trim($url);
    $href = trim($href);

    // Already absolute ("scheme:...") or protocol-relative ("//host/..."):
    // prefixing the base would corrupt the link.
    if (preg_match('~^(?:[a-z][a-z0-9+.-]*:|//)~i', $href)) {
        return $href;
    }

    // Empty, query-only and fragment-only references attach straight to the base.
    if ($href === '' || $href[0] === '?' || $href[0] === '#') {
        return $url . $href;
    }

    // Join base and path with exactly one slash between them.
    return rtrim($url, '/') . '/' . ltrim($href, '/');
}

 $url = 'http://example.com';
 //$url = $_GET["url"];

// Fetch the page whose links are going to be rewritten.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$contents = curl_exec($ch);
// Read the error text while the handle is still open.
$curlError = ($contents === false) ? curl_error($ch) : '';
// curl_close() needs the handle; calling it with no argument is an error that
// the former @ prefix merely hid.
curl_close($ch);

// curl_exec() returns false on failure, and loadHTML() rejects an empty input.
if ($contents === false) {
    throw new RuntimeException('curl request failed: ' . $curlError);
}
if ($contents === '') {
    throw new RuntimeException('curl request returned an empty body');
}

$dom = new DOMDocument();
// Fetched HTML is often not well-formed; collect libxml's parse errors
// internally instead of emitting them as warnings, then restore the setting.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($contents);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);


//change the urls to absolute
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor)
{
    // An anchor without an href is not a link; do not invent one for it.
    if (!$anchor->hasAttribute('href')) {
        continue;
    }

    $href = $anchor->getAttribute('href');
    $abs = url_to_absolute($url, $href);
    // setAttribute() replaces an existing value, so removing it first is unnecessary.
    $anchor->setAttribute('href', $abs);

    //changed
    $newhref = $anchor->getAttribute('href');
    echo "newhref = " . $newhref; // value read back right after the write
}


// Query the anchors a second time to check what the document now holds.
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor)
{
    echo "new2 = " . $anchor->getAttribute('href'); // the question reports only http://example.com here
}

//print output
echo $dom->saveHTML();
?>
4

2 に答える 2

0

次のcurlオプションとcurl_init($url)を試してください:

<?php
//include 'url_to_absolute.php';
/**
 * Convert an href found in a document into an absolute URL rooted at $url.
 *
 * Intentionally minimal: relative paths are appended to $url as given, and
 * "../" segments are not resolved.
 *
 * @param string $url  Base URL, e.g. 'http://example.com'.
 * @param string $href Attribute value to convert.
 * @return string The absolute URL; $href unchanged when it is already absolute.
 */
function url_to_absolute($url, $href){
    $url  = trim($url);
    $href = trim($href);

    // Links that carry their own scheme ("http:", "mailto:", ...) or are
    // protocol-relative ("//host") must not get the base glued in front.
    if (preg_match('~^(?:[a-z][a-z0-9+.-]*:|//)~i', $href)) {
        return $href;
    }

    // Nothing, a query string, or a fragment: append directly to the base.
    if ($href === '' || $href[0] === '?' || $href[0] === '#') {
        return $url . $href;
    }

    // Normalise to a single slash between base and path.
    return rtrim($url, '/') . '/' . ltrim($href, '/');
}

$url = 'http://example.com';
//$url = $_GET["url"];

// Fetch the page, following redirects, and return the body as a string.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
$contents = curl_exec($ch);
// Capture the error message before the handle is released.
$curlError = ($contents === false) ? curl_error($ch) : '';
// curl_close() takes the handle as its argument; it was previously called with none.
curl_close($ch);

// curl_exec() returns false on failure, and loadHTML() rejects an empty input.
if ($contents === false) {
    throw new RuntimeException('curl request failed: ' . $curlError);
}
if ($contents === '') {
    throw new RuntimeException('curl request returned an empty body');
}

$dom = new DOMDocument();
// Keep libxml's complaints about malformed markup out of the output, then
// put the previous error-handling mode back.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($contents);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);
//$dom->saveHTMLFile('dom_doc_test.html');


//change the urls to absolute
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor)
{
    // Skip anchors that have no href so that none is added to them.
    if (!$anchor->hasAttribute('href')) {
        continue;
    }

    $href = $anchor->getAttribute('href');
    $abs = url_to_absolute($url, $href);
    // setAttribute() overwrites the old value; a preceding removeAttribute() is redundant.
    $anchor->setAttribute('href', $abs);

    //changed
    $newhref = $anchor->getAttribute('href') . '<br />';
    echo "newhref = " . $newhref; // value read back right after the write
}

// Second, independent query to verify the values stored in the document.
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor)
{
    echo "new2 = " . $anchor->getAttribute('href') . '<br />';
}

//print output
echo $dom->saveHTML();
?>
2012-06-21T09:05:32.540 に回答
0

url_to_absolute関数のバグであるはずです。私の単純なurl_to_absoluteは次のとおりです。

/**
 * Minimal stand-in for a URL resolver: appends $href to $url and strips
 * surrounding whitespace from the combined string.
 *
 * @param string $url  Base URL.
 * @param string $href Value to append to the base.
 * @return string The concatenation, trimmed at both ends.
 */
function url_to_absolute($url, $href){
    $combined = $url . $href;

    return trim($combined);
}

// Base that is prefixed onto every link in the demo document.
$url = 'http://example.com';

// Parse a small fixed document with two root-relative links, so the run is reproducible.
$dom = new DOMDocument();
$dom->loadHTML('<html><body><a href="/path.html?q=hello&a=bye"></a><a href="/path2.html?before=34&after=44"></a></body></html>');

// Pass 1: print each href as it was parsed.
$links = $dom->getElementsByTagName('a');
for ($i = 0; $i < $links->length; $i++) {
    echo "href = " . $links->item($i)->getAttribute('href') . '<br />';
}

echo '<br />';

// Pass 2: swap each href for its absolute form and read it straight back.
$links = $dom->getElementsByTagName('a');
for ($i = 0; $i < $links->length; $i++) {
    $link = $links->item($i);
    $absolute = url_to_absolute($url, $link->getAttribute('href'));

    $link->removeAttribute('href');
    $link->setAttribute('href', $absolute);

    echo "newhref = " . $link->getAttribute('href') . '<br />';
}

echo '<br />';

// Pass 3: query the document afresh and print what is stored now.
foreach ($dom->getElementsByTagName('a') as $link) {
    echo "new2 = " . $link->getAttribute('href') . '<br />';
}

結果は次のとおりです。

href = /path.html?q=hello&a=bye
href = /path2.html?before=34&after=44

newhref = http://example.com/path.html?q=hello&a=bye
newhref = http://example.com/path2.html?before=34&after=44

new2 = http://example.com/path.html?q=hello&a=bye
new2 = http://example.com/path2.html?before=34&after=44
2012-06-20T19:55:43.050 に回答