议题作者:amxku
信息来源:邪恶八进制信息安全团队(www.eviloctal.com)
PHP版采集程序核心代码
复制内容到剪贴板
代码:
<?php
// Entry script: fetch the page named by the "url" query parameter and print
// its body, HTML-escaped, so the raw markup can be inspected in a browser.
//
// Example: $url = "http://www.csnexp.com";
//
// NOTE(security): this fetches whatever URL the visitor supplies, so the
// server can be used to reach internal hosts (SSRF). Restrict the allowed
// hosts before exposing this script publicly.
$url = isset($_GET["url"]) ? $_GET["url"] : "";
// Reject array input (?url[]=...) as well as a missing or empty parameter.
if (is_string($url) && $url !== "")
{
    $f = getFileContents($url);
    // 'file' is absent when the connection or the request failed.
    if (is_array($f) && isset($f["file"])) {
        print_r(htmlspecialchars($f["file"]));
    }
}
/**
 * Fetch a URL with a hand-built HTTP/1.0 GET request over a raw socket.
 *
 * The response is read until the server closes the connection or the read
 * times out; everything after the first blank line ("\r\n\r\n") is returned
 * as the body. Redirects are not followed and the HTTP status code is not
 * inspected.
 *
 * @param string $url Absolute http:// or https:// URL.
 * @return array 'state' is one of "ok", "timeout", "NOHOST" or
 *               "Cannot send request". 'file' (the response body) is set
 *               only when the request was sent, i.e. for "ok" and "timeout".
 */
function getFileContents($url)
{
    $contents = array();
    $fsocket_timeout = 30;
    // Header value only; the "User-Agent:" name is added when the request is built.
    $user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)";

    $urlparts = parse_url($url);
    if (!is_array($urlparts) || !isset($urlparts['host']) || $urlparts['host'] === "") {
        // Without a host there is nothing to connect to.
        $contents['state'] = "NOHOST";
        print "Error: invalid URL";
        return $contents;
    }

    $host = $urlparts['host'];
    // Any scheme other than https is treated as plain HTTP.
    $scheme = isset($urlparts['scheme']) ? strtolower($urlparts['scheme']) : "http";
    $is_https = ($scheme == "https");

    // An empty path would produce the invalid request line "GET  HTTP/1.0".
    $path = (isset($urlparts['path']) && $urlparts['path'] !== "") ? $urlparts['path'] : "/";
    if (isset($urlparts['query']) && $urlparts['query'] != "") {
        $path .= "?".$urlparts['query'];
    }

    $default_port = $is_https ? 443 : 80;
    $port = isset($urlparts['port']) ? (int) $urlparts['port'] : $default_port;
    // The Host header carries the port only when it is not the scheme default.
    $portq = ($port == $default_port) ? "" : ":".$port;

    $all = "*/*";
    $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\nAccept: $all\r\nAccept-Encoding: identity\r\nUser-Agent: $user_agent\r\n\r\n";

    $target = $is_https ? "ssl://".$host : $host;

    $errno = 0;
    $errstr = "";
    // Warnings are suppressed on purpose: the failure is reported through $errstr.
    $fp = @ fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        print "Error: $errstr";
        return $contents;
    }

    if (!fputs($fp, $request)) {
        // Close the socket before bailing out so the handle is not leaked.
        fclose($fp);
        $contents['state'] = "Cannot send request";
        return $contents;
    }

    stream_set_timeout($fp, $fsocket_timeout);
    $data = "";
    $timed_out = false;
    while (!feof($fp)) {
        $chunk = fgets($fp, 8192);
        if ($chunk !== false) {
            $data .= $chunk;
        }
        // Re-read the stream metadata after every read; a status sampled once
        // before the loop can never report a timeout.
        $status = stream_get_meta_data($fp);
        if ($status['timed_out']) {
            $timed_out = true;
            break;
        }
        if ($chunk === false) {
            // Read error without EOF: stop instead of spinning.
            break;
        }
    }
    fclose($fp);

    $contents['state'] = $timed_out ? "timeout" : "ok";

    // The body starts after the blank line that ends the header block. If the
    // separator never arrived there is no body to return.
    $header_end = strpos($data, "\r\n\r\n");
    $contents['file'] = ($header_end === false) ? "" : (string) substr($data, $header_end + 4);

    return $contents;
}
?>对于这种情况,很多站长束手无策,目前我想到的有两种方法可以稍加防范:
1:文章生成静态HTML页面。命名以长一点的时间和随机数或随机字符为文件名,如果您的服务器容量够大,数据多的时候生成静态页还可以减轻服务器负载。
2:在文章内容里加入随机长度的随机字符,并让这些字符的颜色与网页底色相同,这样既不影响正常用户的浏览,也可以比较好地阻止采集程序对你内容的采集,因为字符长度不好确定。:)
希望朋友们也说说自己对数据采集的看法。
有没有什么更好的办法来防止PHP站点被采集呢?
高见?