最新文章
微信公众号文章PHP采集程序
2024-12-21 17:52
<?php
/**
 * Fetches a WeChat public-account article and stores its images locally.
 *
 * The article HTML is read from the given URL, a handful of metadata values
 * are pulled out of the page's inline `var name = "value"` assignments, and
 * every `mmbiz` image referenced from the article body (via `data-src` or a
 * CSS `background-image`) is saved into the target directory so the returned
 * HTML points at the local copies.
 */
class Gather
{
    /** @var string Address the article HTML is read from. */
    private $url;

    /** @var string Directory that receives the downloaded images. */
    private $path;

    /**
     * @param string $url  Address of the article to fetch.
     * @param string $path Directory in which downloaded images are stored.
     */
    public function __construct($url, $path)
    {
        $this->url = $url;
        $this->path = $path;
        // Downloading many images can be slow, so lift the execution time limit.
        set_time_limit(0);
    }

    /**
     * Fetches the article configured in the constructor.
     *
     * @return array Map with the keys url, title, cover, nickname, ct,
     *               user_name, round_head_img, bizId, msg_desc and content.
     *               A value that could not be found in the page is null
     *               (content is an empty string in that case).
     *
     * @throws RuntimeException When the target directory cannot be created
     *                          or the article cannot be read.
     */
    public function fetch()
    {
        return $this->transform($this->url, $this->path);
    }

    /**
     * Downloads one image and writes it to "$path/$name.<type>".
     *
     * The extension is derived from the MIME type detected in the downloaded
     * bytes, so the image is fetched only once.
     *
     * @param string|null $url  Address of the image.
     * @param string      $path Directory the file is written to.
     * @param string      $name File name without extension; characters other
     *                          than letters, digits, "_" and "-" are replaced
     *                          by "_" because the name comes from page content.
     *
     * @return string|null Path of the written file, or null when the image
     *                     could not be fetched, recognised or stored.
     */
    private function createPic($url, $path, $name)
    {
        if (!is_string($url) || $url === '') {
            return null;
        }

        $img = file_get_contents($url);
        if ($img === false || $img === '') {
            return null;
        }

        $info = getimagesizefromstring($img);
        if ($info === false || empty($info['mime'])) {
            return null;
        }

        $type = str_replace('image/', '', $info['mime']);
        $safeName = preg_replace('/[^A-Za-z0-9_-]/', '_', (string) $name);
        $fileName = $path . DIRECTORY_SEPARATOR . $safeName . '.' . $type;

        if (file_put_contents($fileName, $img) === false) {
            return null;
        }

        return $fileName;
    }

    /**
     * Reads the double-quoted string assigned to an inline JavaScript variable.
     *
     * @param string $content Page source.
     * @param string $name    Variable name, e.g. "nickname".
     *
     * @return string|null The text between the quotes, or null when the page
     *                     holds no `var <name> = "..."` assignment.
     */
    private function extractJsVar($content, $name)
    {
        $pattern = '/var\s+' . preg_quote($name, '/') . '\s*=\s*"([^"]*)"/';

        return preg_match($pattern, $content, $match) === 1 ? $match[1] : null;
    }

    /**
     * Saves every mmbiz image referenced by the HTML and rewrites the
     * references to the saved files.
     *
     * Images that cannot be downloaded keep their original address.
     *
     * @param string $html Article body.
     * @param string $path Directory the images are written to.
     *
     * @return string The HTML with downloaded image addresses replaced.
     */
    private function localiseImages($html, $path)
    {
        // Group 1 is the image address, group 2 the path segment that follows
        // "/mmbiz/" (or "/mmbiz_jpg/", "/mmbiz_png/", ...) and is used as the
        // local file name. "~" is the delimiter so "/" needs no escaping.
        $pattern = '~(?:data-src="|background-image\s*:\s*url\s*\(\s*["\']?)'
            . '([a-zA-Z]+://[^\s"\'()<>]*/mmbiz(?:_[a-z0-9]+)?/([^\s/"\'()<>]+)/\d+(?:\?[^\s"\'()<>]*)?)~i';

        if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
            return $html;
        }

        $localFiles = [];
        foreach ($matches as $match) {
            list(, $imageUrl, $imageName) = $match;
            if (array_key_exists($imageUrl, $localFiles)) {
                continue; // Already attempted; never download the same image twice.
            }
            // Attribute values are HTML-encoded ("&amp;"), the request must not be.
            $localFiles[$imageUrl] = $this->createPic(
                html_entity_decode($imageUrl, ENT_QUOTES),
                $path,
                $imageName
            );
        }

        $replacements = array_filter($localFiles, 'is_string');
        if ($replacements === []) {
            return $html;
        }

        // strtr() tries the longest key first and never rescans replaced text,
        // so an address that is a prefix of another one cannot corrupt it.
        return strtr($html, $replacements);
    }

    /**
     * Downloads the article, extracts its metadata and localises its images.
     *
     * @param string $url  Address of the article.
     * @param string $path Directory in which downloaded images are stored;
     *                     created (recursively) when missing.
     *
     * @return array See fetch().
     *
     * @throws RuntimeException When the directory cannot be created or the
     *                          article cannot be read.
     */
    private function transform($url, $path)
    {
        if (!is_dir($path) && !mkdir($path, 0777, true) && !is_dir($path)) {
            throw new RuntimeException("Unable to create directory: {$path}");
        }

        $content = file_get_contents($url);
        if ($content === false) {
            throw new RuntimeException("Unable to fetch article: {$url}");
        }

        $data = [];
        $data['url'] = $url; // Article URL.

        // Article title, with surrounding whitespace removed.
        $data['title'] = preg_match('/<title>(.*?)<\/title>/is', $content, $match) === 1
            ? trim($match[1])
            : null;

        $data['cover'] = $this->extractJsVar($content, 'msg_cdn_url');     // Cover image address.
        $data['nickname'] = $this->extractJsVar($content, 'nickname');     // Account display name.
        $data['ct'] = $this->extractJsVar($content, 'ct');                 // Publication timestamp.
        $data['user_name'] = $this->extractJsVar($content, 'user_name');   // Original account id.

        // Account avatar, stored locally.
        $data['round_head_img'] = $this->createPic(
            $this->extractJsVar($content, 'round_head_img'),
            $path,
            'round_head_img_' . $data['user_name']
        );

        // Account biz id, taken from the article URL's query string.
        $data['bizId'] = preg_match('/[?&]__biz=([^&#]*)/', $url, $match) === 1 ? $match[1] : null;

        $data['msg_desc'] = $this->extractJsVar($content, 'msg_desc');     // Article summary.

        // Article body. The match is deliberately non-greedy, so it ends at
        // the first closing </div> after the opening js_content tag.
        $body = preg_match('/<div\b[^>]*\bid="js_content"[^>]*>(.*?)<\/div>/is', $content, $match) === 1
            ? $match[1]
            : '';

        $body = $this->localiseImages($body, $path);

        // Turn the lazy-loading attribute into a regular src attribute.
        $body = str_replace('data-src', 'src', $body);

        $data['content'] = trim($body);

        return $data;
    }
}
    以上就是本篇文章【微信公众号文章PHP采集程序】的全部内容了，欢迎阅览！文章地址：http://ww.kub2b.com/quote/8401.html
     栏目首页      相关文章      动态      同类文章      热门文章      网站地图      返回首页 企库往资讯移动站http://ww.kub2b.com/mobile/,查看更多   
发表评论
0评