解决http请求字符乱码的bug
上次介绍的http请求函数足够用了,可是在某些时候从浏览器打开url是正常的,使用request_url却只得到一堆乱码,于是翻看手册发现,这是由于该url所在的服务器开启了gzip压缩导致的。使用curl扩展进行请求时,记得开启gzip自动解压:
curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
另外,如果没有使用curl,而是使用fsockopen或file_get_contents,则需要手动解压:当PHP>=5.4时,使用gzdecode函数对这些乱码解码;当PHP<5.4的时候,需要自行实现解压函数(可以在PHP手册上找到)。
/**
 * Send an HTTP(S) request with cURL and return the response body.
 *
 * A GET request is sent unless $postFields is a non-empty array, in which
 * case a POST is sent. If any field value is a file upload (a string of the
 * form "@/path/to/file[;type=mime][;filename=name]" or a CURLFile object) the
 * body is sent as multipart/form-data, otherwise as
 * application/x-www-form-urlencoded.
 *
 * The request advertises gzip support (CURLOPT_ENCODING), so cURL inflates a
 * gzip-compressed response transparently instead of returning raw bytes.
 *
 * @param string     $url        Request URL.
 * @param array|null $postFields POST fields (name => value); null or an empty
 *                               array results in a GET request.
 * @return string Response body.
 * @throws Exception With code 0 and the cURL error text when the transfer
 *                   fails; with the HTTP status as code and the response body
 *                   as message when the status is not 200.
 */
function request_url($url, $postFields = null)
{
    $ch = curl_init();
    if ($ch === false) {
        throw new Exception('Unable to initialise a cURL session', 0);
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // Keep the body of 4xx/5xx responses so it can be reported to the caller.
    curl_setopt($ch, CURLOPT_FAILONERROR, false);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    // HTTPS request.
    if (strlen($url) > 5 && strncasecmp($url, 'https', 5) === 0) {
        // NOTE(review): certificate and host verification are switched off,
        // which makes the connection vulnerable to man-in-the-middle attacks.
        // Kept as-is for backward compatibility; enable verification if every
        // target host presents a valid certificate.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    // Pool of browser user agents; one is picked at random per request.
    $browser = array(
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36',
    );
    curl_setopt($ch, CURLOPT_USERAGENT, $browser[array_rand($browser)]);
    curl_setopt($ch, CURLOPT_REFERER, $url);

    // Ask for gzip and let cURL decompress it; without this a gzip-encoded
    // response comes back as unreadable binary data.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip');

    // A non-empty array of fields turns the request into a POST.
    if (is_array($postFields) && 0 < count($postFields)) {
        $pairs = array();
        $isMultipart = false;

        foreach ($postFields as $name => $value) {
            if ($value instanceof CURLFile) {
                // Caller already supplied an upload object.
                $isMultipart = true;
                continue;
            }

            if ('@' !== substr((string) $value, 0, 1)) {
                $pairs[] = "$name=" . urlencode((string) $value);
                continue;
            }

            // "@path" marks a file upload, which requires multipart/form-data.
            $isMultipart = true;

            // The "@" prefix is ignored by default since PHP 5.6 and was
            // removed in PHP 7, so translate it to CURLFile when available.
            if (class_exists('CURLFile')) {
                $spec = explode(';', substr($value, 1));
                $path = array_shift($spec);
                if (is_file($path)) {
                    $mime = '';
                    $postName = '';
                    foreach ($spec as $attribute) {
                        if (strncmp($attribute, 'type=', 5) === 0) {
                            $mime = substr($attribute, 5);
                        } elseif (strncmp($attribute, 'filename=', 9) === 0) {
                            $postName = substr($attribute, 9);
                        }
                    }
                    $postFields[$name] = new CURLFile($path, $mime, $postName);
                }
            }
        }

        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $isMultipart ? $postFields : implode('&', $pairs));
    }

    $response = curl_exec($ch);

    // Collect everything that needs the handle, then release it before any
    // exception is thrown so the handle is not leaked on the error paths.
    $errno = curl_errno($ch);
    $error = curl_error($ch);
    $httpStatusCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($errno) {
        throw new Exception($error, 0);
    }
    if (200 !== $httpStatusCode) {
        throw new Exception($response, $httpStatusCode);
    }

    return $response;
}

/**
 * Decode a gzip (RFC 1952) stream; fallback for PHP < 5.4, which has no
 * native gzdecode().
 *
 * The function never fails: whenever the input is not a well-formed gzip
 * stream (bad magic, reserved flag bits set, truncated header, bad header
 * CRC, unsupported compression method, inflate error, or size/CRC32 mismatch
 * in the trailer) the input is returned unchanged, so it is safe to call on
 * data that may or may not be compressed.
 *
 * @param string $data Possibly gzip-compressed data.
 * @return string Decompressed data, or $data itself if it cannot be decoded.
 */
function _gzdecode($data)
{
    $len = strlen($data);

    // Minimum size is a 10-byte header plus an 8-byte trailer; the stream
    // must start with the gzip magic bytes.
    if ($len < 18 || strcmp(substr($data, 0, 2), "\x1f\x8b")) {
        return $data;
    }

    $method = ord(substr($data, 2, 1)); // Compression method
    $flags = ord(substr($data, 3, 1));  // Flags

    // Bits 5-7 are reserved and must be zero. The parentheses matter:
    // "!=" binds tighter than "&".
    if (($flags & 31) != $flags) {
        return $data;
    }

    // Bytes 4-9 (MTIME, XFL, OS) carry nothing needed for decoding.
    $headerlen = 10;

    if ($flags & 4) {
        // FEXTRA: 2-byte little-endian length followed by that many bytes.
        if ($len - $headerlen - 2 < 8) {
            return $data;
        }
        $extralen = unpack('v', substr($data, $headerlen, 2));
        $extralen = $extralen[1];
        if ($len - $headerlen - 2 - $extralen < 8) {
            return $data;
        }
        $headerlen += 2 + $extralen;
    }

    if ($flags & 8) {
        // FNAME: zero-terminated original file name.
        if ($len - $headerlen - 1 < 8) {
            return $data;
        }
        $terminator = strpos($data, "\x00", $headerlen);
        if ($terminator === false || $len - $terminator - 1 < 8) {
            return $data;
        }
        $headerlen = $terminator + 1;
    }

    if ($flags & 16) {
        // FCOMMENT: zero-terminated comment.
        if ($len - $headerlen - 1 < 8) {
            return $data;
        }
        $terminator = strpos($data, "\x00", $headerlen);
        if ($terminator === false || $len - $terminator - 1 < 8) {
            return $data;
        }
        $headerlen = $terminator + 1;
    }

    if ($flags & 2) {
        // FHCRC: low 16 bits of the CRC32 of all header bytes before it.
        if ($len - $headerlen - 2 < 8) {
            return $data;
        }
        $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
        $headercrc = unpack('v', substr($data, $headerlen, 2));
        if ($headercrc[1] != $calccrc) {
            return $data;
        }
        $headerlen += 2;
    }

    // Trailer: CRC32 and length (mod 2^32) of the uncompressed data. On
    // 32-bit PHP both unpack('V') and crc32() may yield negative integers,
    // but they do so consistently, so the loose comparisons below hold.
    $trailer = unpack('Vcrc/Visize', substr($data, -8));

    $bodylen = $len - $headerlen - 8;
    if ($bodylen < 1) {
        return $data;
    }

    // Method 8 (deflate) is the only one defined by RFC 1952.
    if ($method !== 8) {
        return $data;
    }

    $decoded = gzinflate(substr($data, $headerlen, $bodylen));
    if ($decoded === false) {
        return $data;
    }

    // Verify decompressed size and CRC32 against the trailer.
    if ($trailer['isize'] != strlen($decoded) || crc32($decoded) != $trailer['crc']) {
        return $data;
    }

    return $decoded;
}
开启curl_setopt($ch, CURLOPT_ENCODING, 'gzip')
或者使用_gzdecode函数都不会对未经gzip压缩的数据产生影响,可以放心使用。
教程地址:http://blog.zhengshuiguang.com/php/http-gz.html
欢迎转载!但请带上文章地址^^
评论已关闭