スクリプトの中で WWW サーバー上にあるファイルの内容を読み取ったり解析したりする場合は、常に cURL を使用することをおすすめします。
<?php
/**
 * Static helper around the cURL extension for fetching a URL as a string
 * (fetchContent) or saving it to disk (downloadFile), with optional
 * round-robin proxy rotation.
 *
 * All configuration and state is static, so it is shared by every caller in
 * the process.
 */
class CurlTool {
    /** Ready-made User-Agent strings, keyed by a short browser name. */
    public static $userAgents = array(
        'FireFox3' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; pl; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'GoogleBot' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'IE7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Netscape' => 'Mozilla/4.8 [en] (Windows NT 6.0; U)',
        'Opera' => 'Opera/9.25 (Windows NT 6.0; U; en)'
    );

    /**
     * cURL options applied to every transfer through curl_setopt_array().
     * They are applied after the proxy and POST options, so an entry here
     * overrides those per-request settings.
     */
    public static $options = array(
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        CURLOPT_AUTOREFERER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FRESH_CONNECT => true,
        // Relative paths: the cookie file is resolved against the current
        // working directory of the running script.
        CURLOPT_COOKIEJAR => "cookies.txt",
        CURLOPT_COOKIEFILE => "cookies.txt",
        // NOTE(review): TLS peer verification is switched off, which allows
        // man-in-the-middle attacks. Kept as-is because existing callers may
        // rely on it; set CurlTool::$options[CURLOPT_SSL_VERIFYPEER] = true
        // wherever the remote certificates are valid.
        CURLOPT_SSL_VERIFYPEER => false,
    );

    /** Proxy URLs registered through addProxyServer(). */
    private static $proxyServers = array();

    /** Number of entries in $proxyServers. */
    private static $proxyCount = 0;

    /** Index of the proxy that the next transfer will use. */
    private static $currentProxyIndex = 0;

    /** Result of curl_getinfo() for the most recently executed transfer. */
    public static $getinfo;

    /**
     * Registers a proxy; transfers cycle through all registered proxies.
     *
     * @param string $url Proxy address as accepted by CURLOPT_PROXY.
     */
    public static function addProxyServer($url) {
        self::$proxyServers[] = $url;
        ++self::$proxyCount;
    }

    /**
     * Fetches a URL and returns its body with every run of whitespace
     * (including newlines) collapsed to a single space.
     *
     * The request is a POST when $fields is a non-empty array, otherwise a
     * plain GET.
     *
     * @param string     $url     URL to request.
     * @param array|null $fields  Form fields (name => value) to POST.
     * @param bool       $verbose Echo progress information when exactly true.
     *
     * @return string Whitespace-normalised response body.
     *
     * @throws Exception When the handle cannot be created or the transfer fails.
     */
    public static function fetchContent($url, $fields = null, $verbose = false) {
        $curl = self::openHandle($url, 'Reading', $verbose);

        $fields_string = self::buildFieldsString($fields, $verbose);
        // is_array() guards count(): count(null) is a TypeError on PHP 8.
        if (is_array($fields) && count($fields) > 0) {
            curl_setopt($curl, CURLOPT_POST, true);
            curl_setopt($curl, CURLOPT_POSTFIELDS, $fields_string);
        }
        if ($verbose === true) {
            echo "Fields string $fields_string ... ";
        }

        curl_setopt_array($curl, self::$options);
        $content = curl_exec($curl);
        self::$getinfo = curl_getinfo($curl);

        // Read the error text before closing so the handle is released on
        // the failure path as well.
        $error = curl_error($curl);
        curl_close($curl);

        if ($content === false) {
            throw new Exception("curl_exec error for url $url " . $error);
        }
        if ($verbose === true) {
            echo "Done.\n";
        }

        $content = preg_replace('#\n+#', ' ', $content);
        $content = preg_replace('#\s+#', ' ', $content);
        return $content;
    }

    /**
     * Downloads a URL into a file and returns the path of that file.
     *
     * If $fileName already exists and its first 3 KiB do not contain the
     * marker "__VIEWSTATE", the existing file is returned and no request is
     * made. If $fileName ends with "/", it is treated as a directory: the
     * body is written to a temporary file and then moved into that directory
     * under the last path segment of the effective (post-redirect) URL.
     *
     * The request is a POST when $fields is a non-empty array, otherwise a
     * plain GET.
     *
     * @param string     $url      URL to download.
     * @param string     $fileName Target file, or target directory ending in "/".
     * @param array|null $fields   Form fields (name => value) to POST.
     * @param bool       $verbose  Echo progress information when exactly true.
     *
     * @return string Path of the downloaded (or already present) file.
     *
     * @throws Exception When the handle or target file cannot be opened, the
     *                   transfer fails, or the file cannot be moved into place.
     */
    public static function downloadFile($url, $fileName, $fields = null, $verbose = false) {
        // Checked before a handle is opened so a cache hit allocates nothing.
        if (self::isUsableCachedFile($fileName)) {
            return $fileName;
        }

        $curl = self::openHandle($url, 'Downloading', $verbose);
        if (is_array($fields) && count($fields) > 0) {
            curl_setopt($curl, CURLOPT_POST, true);
            curl_setopt($curl, CURLOPT_POSTFIELDS, self::buildFieldsString($fields, false));
        }
        curl_setopt_array($curl, self::$options);

        $targetDir = null;
        if (substr($fileName, -1) == '/') {
            $targetDir = $fileName;
            $fileName = tempnam(sys_get_temp_dir(), 'c_');
        }

        if ($fileName === false || ($fp = fopen($fileName, "w")) === false) {
            curl_close($curl);
            throw new Exception("fopen error for filename $fileName");
        }

        // Set after the shared options so that writing to the file takes
        // precedence over CURLOPT_RETURNTRANSFER from self::$options.
        curl_setopt($curl, CURLOPT_FILE, $fp);
        $ret = curl_exec($curl);
        self::$getinfo = curl_getinfo($curl);
        $error = curl_error($curl);
        $effectiveUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
        curl_close($curl);
        fclose($fp);

        if ($ret === false) {
            // Do not leave a truncated file behind.
            unlink($fileName);
            throw new Exception("curl_exec error for url $url. " . $error);
        }

        if ($targetDir !== null) {
            // Everything after the last "/" of the effective URL; fall back
            // to the temporary name when the URL ends in "/".
            if (preg_match('#^.*/(.+)$#', (string) $effectiveUrl, $match) === 1) {
                $baseName = $match[1];
            } else {
                $baseName = basename($fileName);
            }
            $target = $targetDir . $baseName;
            if (!rename($fileName, $target)) {
                unlink($fileName);
                throw new Exception("rename error for filename $target");
            }
            $fileName = $target;
        }

        if ($verbose === true) {
            echo "Done.\n";
        }
        return $fileName;
    }

    /**
     * Creates a cURL handle for $url, assigns the next proxy in the rotation
     * (if any are registered) and echoes the progress line in verbose mode.
     *
     * @param string $url     URL to request.
     * @param string $action  Verb used in the progress line.
     * @param bool   $verbose Echo the progress line when exactly true.
     *
     * @return resource|object The cURL handle.
     *
     * @throws Exception When curl_init() fails.
     */
    private static function openHandle($url, $action, $verbose) {
        $curl = curl_init($url);
        if ($curl === false) {
            throw new Exception("curl_init error for url $url.");
        }

        $proxy = null;
        if (self::$proxyCount > 0) {
            $proxy = self::$proxyServers[self::$currentProxyIndex % self::$proxyCount];
            // Keep the index bounded instead of letting it grow forever.
            self::$currentProxyIndex = (self::$currentProxyIndex + 1) % self::$proxyCount;
            curl_setopt($curl, CURLOPT_PROXY, $proxy);
        }

        if ($verbose === true) {
            if ($proxy !== null) {
                echo "$action $url [Proxy: $proxy] ... ";
            } else {
                echo "$action $url ... ";
            }
        }
        return $curl;
    }

    /**
     * Encodes form fields as "k1=v1&k2=v2" with url-encoded values.
     *
     * Keys are sent verbatim, and entries whose key is empty() are skipped;
     * note that this also skips the keys 0 and "0".
     *
     * @param mixed $fields  Field map; anything but an array yields ''.
     * @param bool  $verbose Echo "key: value" per field when exactly true.
     *
     * @return string Encoded body without a trailing "&".
     */
    private static function buildFieldsString($fields, $verbose) {
        if (!is_array($fields)) {
            return '';
        }

        $pairs = array();
        foreach ($fields as $key => $value) {
            if (empty($key)) {
                continue;
            }
            $pairs[] = $key . '=' . urlencode($value);
            if ($verbose === true) {
                echo $key . ": " . $value;
            }
        }
        return implode('&', $pairs);
    }

    /**
     * Tells whether $fileName is an existing file whose first 3 KiB do not
     * contain "__VIEWSTATE".
     *
     * NOTE(review): the marker presumably identifies an ASP.NET form page
     * saved in place of the real download - confirm with the callers.
     *
     * @param string $fileName Path to inspect.
     *
     * @return bool True when the existing file can be reused as-is.
     */
    private static function isUsableCachedFile($fileName) {
        if (!is_file($fileName)) {
            return false;
        }
        // Offset 0 reads from the start; a negative offset counts from the
        // end of the file on PHP 7.1+ and would read only the last byte.
        $head = file_get_contents($fileName, false, null, 0, 3 * 1024);
        return $head !== false && strpos($head, "__VIEWSTATE") === false;
    }
}
?>
上記のクラスでは cURL が常にブラウザーのヘッダー (User-Agent) を送信し、リダイレクトにも自動的に追従するため、タイムアウトやリダイレクト、die() による処理の打ち切りといった問題を避けられます。
ファイルとして保存する (downloadFile) か、内容を文字列として取得する (fetchContent) かを、用途に応じて選択してください。