phpQuery采集新闻例子

作者: admin 分类: php 发布时间: 2017-07-18 06:44

找了一个网站,把上面的新闻采集下来。这种窃取别人成果的做法我一般不建议使用,本文仅可用作学习教程。
以此网站为例:http://www.ssj.org.cn/news/ 。这个网站提供了站点地图,很容易就可以提取出新闻的 url。

获取新闻url之后采集就很方便了

<?php 

// Announce UTF-8 HTML output; "@" silences any warning header() may raise.
@header("Content-Type: text/html; charset=UTF-8");
// Remove the execution time limit so a long scraping run is not cut off.
set_time_limit(0);
// E_USER_NOTICE has the value 1024: only that error level is reported.
error_reporting(E_USER_NOTICE);
/**
 * Read a list of URLs from a text file, one entry per line.
 *
 * Every line is trimmed and blank lines are skipped, so the result holds
 * only non-empty strings. The script terminates via die() when the file
 * cannot be opened.
 *
 * @param string $fileName Path of the file to read.
 * @return array Trimmed, non-empty lines in file order; an empty array when
 *               the file contains no usable lines.
 */
function getUrls($fileName){
	$lineArr=array();
	$fp = fopen($fileName, "rb") or die("Unable to open file ".$fileName);

	// fgets() returns false at end of file (and on a read error), so testing
	// its result directly avoids appending a bogus empty entry at EOF.
	while(($line=fgets($fp)) !== false)
	{
		$line=trim($line);
		if($line === ''){
			continue;
		}
		$lineArr[]=$line;
	}

	fclose($fp);
	return $lineArr;

}


//print_r($urls);

/**
 * Download a page with phpQuery and extract its title and body text.
 *
 * The title is the text of ".news_a h2"; the body is the text of ".news_a"
 * after a fixed list of child selectors has been removed. Both strings then
 * go through a keyword replacement, a conversion to UTF-8 and addslashes().
 *
 * @param string $url Address of the page to load.
 * @return array Associative array with the keys 'title' and 'content'.
 */
function getSource($url){
	require_once('phpQuery/phpQuery.php');
	$html=phpQuery::newDocumentFile($url);

	// The title must be read before the ".news_a" container is removed below.
	$title=$html->find('.news_a h2')->text();

	// NOTE(review): relies on phpQuery's remove() returning the removed
	// elements so they can still be queried - confirm against the library.
	$content=$html->find(".news_a")->remove();

	// Strip navigation, links, scripts and share widgets from the body,
	// in the same order as before.
	$unwanted=array(
		'.news_a_t_a',
		'.news_a_t',
		'a',
		'#news_next_b',
		'#other-news',
		'script',
		'#ckepop',
	);
	foreach($unwanted as $selector){
		$content->find($selector)->remove();
	}

	$content=$content->text();

	// Keyword substitutions applied to both title and body.
	$search = array(
		"~豫弘~is",
		"~河南~is",
		"~关键字:\|\|~is"
	);
	$replace = array(
		"世邦",
		"上海",
		""
	);

	// --- body: replace keywords, normalise encoding, escape ---
	$replaced=preg_replace($search,$replace,$content);
	if($replaced !== null){
		// preg_replace() yields null on a PCRE error; keep the text then.
		$content=$replaced;
	}
	$encoding=mb_detect_encoding($content);
	if($encoding === false){
		// Detection failed: fall back to UTF-8 rather than passing false on.
		$encoding='UTF-8';
	}
	$content=mb_convert_encoding($content,'utf-8',$encoding);
	$content=addslashes($content);

	// --- title: normalise encoding, escape, replace keywords ---
	$encoding=mb_detect_encoding($title);
	if($encoding === false){
		$encoding='UTF-8';
	}
	$title=mb_convert_encoding($title,'utf-8',$encoding);
	$title=addslashes($title);
	$replaced=preg_replace($search,$replace,$title);
	if($replaced !== null){
		$title=$replaced;
	}

	return array('title'=>$title,'content'=>$content);

}



/**
 * Write a title and a body to a new timestamp-named ".txt" file and echo a
 * status line.
 *
 * The file is named "<unix time>.txt"; when that name is already taken
 * (several calls within one second) a "_<n>" suffix is added so an earlier
 * file is never overwritten. The script terminates via die() when the
 * directory cannot be created or the file cannot be opened or written.
 *
 * @param string $title   Written as the first line of the file.
 * @param string $content Written after the title line; may be empty.
 * @param string $dir     Target directory, created (recursively) when
 *                        missing. Empty means the current directory.
 * @return void
 */
function outPut($title,$content,$dir=""){

	$prefix="";
	if(!empty($dir)){
		if(!file_exists($dir)){
			mkdir($dir,0777,true) or die("mkdir failed");
		}
		// Applied on every call, also when the directory already existed.
		chmod($dir,0777);
		$prefix=$dir."/";
	}

	// Pick a name that does not exist yet so files from the same second
	// do not clobber each other.
	$stamp=time();
	$fnTitle=$prefix.$stamp.".txt";
	$suffix=1;
	while(file_exists($fnTitle)){
		$fnTitle=$prefix.$stamp."_".$suffix.".txt";
		$suffix++;
	}

	$fp = fopen($fnTitle,'wb') or die("open ".$fnTitle." fail !");

	@flock($fp ,LOCK_EX );
	// Compare against false: fwrite() returns 0 for an empty string, which
	// is a successful write and must not be reported as a failure.
	if(fwrite($fp,$title.PHP_EOL) === false || fwrite($fp,$content) === false){
		die('write '.$fnTitle." fail !");
	}
	@flock($fp, LOCK_UN);
	fclose($fp);
	echo "write ".$fnTitle." success"."</br>";

}

// Read the URL list, scrape every page and store each result under ./test.
$urls=getUrls('caijiurls.txt');
foreach ($urls as $url) {
	// Skip blank entries so no request is made for an empty URL.
	if(trim($url) === ''){
		continue;
	}
	$source=getSource($url);
	// Dump the extracted data for inspection.
	print_r($source);
	outPut($source['title'],$source['content'],'./test');
}


//$source=getSource('http://www.ssj.org.cn/news/201272092156.html');

//print_r($source);
 ?>

采集结果如下

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表评论

电子邮件地址不会被公开。 必填项已用*标注