2、文档目录:配置文件 - config/config.ini.php;sitemap 主文件 - SiteMap.class.php
3、主文件代码
<?php
/**
 * Command-line sitemap generator.
 *
 * Crawls a web site starting from the configured DOMAIN_NAME, collects the
 * URL and title of every reachable same-domain page, and writes the result
 * to the sitemap file named by the SITEMAPPATH config value.
 *
 * @version 1.0
 */

// NOTE(review): the namespace is reproduced exactly as found. It may originally
// have been "Maweibinguo\SiteMap" with the separator lost in publishing -- confirm.
namespace MaweibinguoSiteMap;

class SiteMap
{
    const SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9";

    /**
     * URLs that have already been discovered (and must not be crawled again).
     *
     * @var array
     * @access public
     */
    public $webUrlList = array();

    /**
     * Collected sitemap entries; each has the keys Url, Title, Priority, ChangeFreq.
     *
     * @var array
     * @access public
     */
    public $siteMapList = array();

    /**
     * @var bool
     * @access public
     */
    public $isUseCookie = false;

    /**
     * @var string
     * @access public
     */
    public $cookieFilePath = "";

    /**
     * @var \XMLWriter
     * @access private
     */
    private $_xmlWriter = null;

    /**
     * Variables defined by the config file, loaded once on first use.
     *
     * @var array|null
     * @access private
     */
    private $_configCache = null;

    /**
     * Create the XML writer and refuse to run outside the command line.
     *
     * @access public
     */
    public function __construct()
    {
        // Fully qualified: an unqualified class name inside a namespace is
        // resolved relative to that namespace, where no XMLWriter exists.
        $this->_xmlWriter = new \XMLWriter();
        $this->_enviromentTest();
    }

    /**
     * Terminate the script unless it is running under the CLI SAPI.
     *
     * @access private
     */
    private function _enviromentTest()
    {
        $sapiType = php_sapi_name();
        if (strtolower($sapiType) != "cli") {
            echo " The Script Must Run In Command Lines ", "\n";
            exit();
        }
    }

    /**
     * Return the config value stored under the given variable name.
     *
     * The config file (config/config.ini.php next to this file) is included
     * only once; its variables are cached for later calls. The script exits
     * if the file does not exist.
     *
     * @param string $configName name of a variable defined in the config file
     * @return mixed the config value, or false when no such variable exists
     * @access public
     */
    public function loadConfig($configName)
    {
        if ($this->_configCache === null) {
            $configPath = __DIR__ . "/config/config.ini.php";
            if (!file_exists($configPath)) {
                echo "Can not find config file", "\n";
                exit();
            }
            $this->_configCache = $this->_readConfigFile($configPath);
        }

        // Callers compare against false, so a missing key must yield false
        // rather than an undefined-variable notice and null.
        if (!is_string($configName) || !array_key_exists($configName, $this->_configCache)) {
            return false;
        }

        return $this->_configCache[$configName];
    }

    /**
     * Include the config file in an isolated scope and return its variables.
     *
     * @param string $__configPath path of the config file
     * @return array variable name => value
     * @access private
     */
    private function _readConfigFile($__configPath)
    {
        require $__configPath;

        $definedVars = get_defined_vars();
        unset($definedVars["__configPath"], $definedVars["this"]);

        return $definedVars;
    }

    /**
     * Write the given entries to the sitemap file named by SITEMAPPATH.
     *
     * Exits the script when the list is empty or the target is not writable.
     *
     * NOTE(review): a "title" child is written for every url element, as in
     * the original code; the sitemap 0.9 protocol itself only defines loc,
     * lastmod, changefreq and priority -- confirm consumers accept it.
     *
     * @param array $siteMapList entries with the keys Url, Title, ChangeFreq, Priority
     * @return bool true when the document was written, false when the file could not be opened
     * @access public
     */
    public function generateSiteMapXml($siteMapList)
    {
        if (!is_array($siteMapList) || count($siteMapList) <= 0) {
            echo "The SiteMap Cotent Is Empty", "\n";
            exit();
        }

        $siteMapPath = $this->loadConfig("SITEMAPPATH");
        if (!is_string($siteMapPath) || $siteMapPath === "") {
            echo "Is Not Writeable", "\n";
            exit();
        }

        // Create the file natively instead of running an unescaped path
        // through the shell.
        if (!file_exists($siteMapPath)) {
            touch($siteMapPath);
        }
        if (!is_writable($siteMapPath)) {
            echo "Is Not Writeable", "\n";
            exit();
        }

        $writer = $this->_xmlWriter;
        if (!$writer->openURI($siteMapPath)) {
            return false;
        }

        $writer->startDocument("1.0", "UTF-8");
        $writer->setIndent(true);
        $writer->startElement("urlset");
        $writer->writeAttribute("xmlns", self::SCHEMA);

        foreach ($siteMapList as $siteMapItem) {
            $location = isset($siteMapItem["Url"]) ? (string) $siteMapItem["Url"] : "";
            $title = isset($siteMapItem["Title"]) ? (string) $siteMapItem["Title"] : "";
            $changefreq = !empty($siteMapItem["ChangeFreq"]) ? $siteMapItem["ChangeFreq"] : "Daily";
            $priority = !empty($siteMapItem["Priority"]) ? $siteMapItem["Priority"] : 0.5;

            $writer->startElement("url");
            $writer->writeElement("loc", $location);
            $writer->writeElement("title", $title);
            // The protocol defines the changefreq values in lowercase.
            $writer->writeElement("changefreq", strtolower((string) $changefreq));
            $writer->writeElement("priority", (string) $priority);
            $writer->endElement();
        }

        $writer->endElement();
        // Finish and flush the document explicitly instead of relying on
        // implicit cleanup when the object is destroyed.
        $writer->endDocument();
        $writer->flush();

        return true;
    }

    /**
     * Fetch a URL with cURL and return the response body.
     *
     * @param string $url absolute URL to request
     * @return mixed the body as a string, or false on an invalid URL, missing
     *               timeout config, transfer failure or a non-200 status
     * @access public
     */
    public function sendRequest($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }

        $connectTimeOut = $this->loadConfig("CURLOPT_CONNECTTIMEOUT");
        if ($connectTimeOut === false) {
            return false;
        }

        $timeOut = $this->loadConfig("CURLOPT_TIMEOUT");
        if ($timeOut === false) {
            return false;
        }

        $handle = curl_init();
        if ($handle === false) {
            return false;
        }

        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_HEADER, false);
        curl_setopt($handle, CURLOPT_AUTOREFERER, true);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeOut);
        curl_setopt($handle, CURLOPT_TIMEOUT, $timeOut);
        curl_setopt($handle, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)");

        $headersItem = array(
            "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Connection: Keep-Alive"
        );
        curl_setopt($handle, CURLOPT_HTTPHEADER, $headersItem);
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);

        $cookieList = $this->loadConfig("COOKIELIST");
        if (is_array($cookieList) && !empty($cookieList["IsUseCookie"]) && !empty($cookieList["CookiePath"])) {
            $cookieFilePath = $cookieList["CookiePath"];
            if (!file_exists($cookieFilePath)) {
                touch($cookieFilePath);
            }
            curl_setopt($handle, CURLOPT_COOKIEFILE, $cookieFilePath);
            curl_setopt($handle, CURLOPT_COOKIEJAR, $cookieFilePath);
        }

        $responseData = curl_exec($handle);
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ($responseData === false || $httpCode != 200) {
            return false;
        }

        return $responseData;
    }

    /**
     * Crawl the given URL and, recursively, every new same-domain link on it.
     *
     * Each successfully fetched page is appended to $siteMapList with its
     * url, title, priority and changefreq. Newly discovered URLs are echoed.
     *
     * @param string $url page to crawl
     * @access public
     */
    public function generateSiteMapList($url)
    {
        $url = trim($url);

        // Mark the page itself as visited so a link back to the start page
        // does not produce a second entry.
        if (!in_array($url, $this->webUrlList, true)) {
            $this->webUrlList[] = $url;
        }

        $content = $this->sendRequest($url);
        if ($content === false) {
            return;
        }

        $tagsList = $this->_parseContent($content, $url);
        $urlItem = isset($tagsList["UrlItem"]) ? $tagsList["UrlItem"] : array();
        $title = isset($tagsList["Title"]) ? $tagsList["Title"] : "";

        $this->siteMapList[] = array(
            "Url" => $url,
            "Title" => trim($title),
            "Priority" => $this->_calculatePriority($url),
            "ChangeFreq" => $this->_calculateChangefreq($url)
        );

        $skipUrlList = $this->loadConfig("SKIP_URLLIST");
        if (!is_array($skipUrlList)) {
            $skipUrlList = array();
        }

        foreach ($urlItem as $nextUrl) {
            if (in_array($nextUrl, $this->webUrlList, true)) {
                continue;
            }

            foreach ($skipUrlList as $keyWords) {
                if ($keyWords !== "" && stripos($nextUrl, $keyWords) !== false) {
                    continue 2;
                }
            }

            $this->webUrlList[] = $nextUrl;
            echo $nextUrl, "\n";
            $this->generateSiteMapList($nextUrl);
        }
    }

    /**
     * Get the sitemap entries collected so far.
     *
     * @access public
     * @return array $siteMapList
     */
    public function getSiteMapList()
    {
        return $this->siteMapList;
    }

    /**
     * Calculate the priority of the target url.
     *
     * The first PRIORITYLIST keyword found in the url decides; 0.5 otherwise.
     *
     * @param string $targetUrl
     * @return mixed $priority configured value, or 0.5 by default
     * @access private
     */
    private function _calculatePriority($targetUrl)
    {
        $priority = 0.5;

        if (!filter_var($targetUrl, FILTER_VALIDATE_URL)) {
            return $priority;
        }

        $priorityList = $this->loadConfig("PRIORITYLIST");
        if (!is_array($priorityList)) {
            return $priority;
        }

        foreach ($priorityList as $priorityKey => $priorityValue) {
            if (stripos($targetUrl, (string) $priorityKey) !== false) {
                $priority = $priorityValue;
                break;
            }
        }

        return $priority;
    }

    /**
     * Calculate the changefreq of the target url.
     *
     * The first CHANGEFREQLIST keyword found in the url decides; "Daily" otherwise.
     *
     * @param string $targetUrl
     * @return string $changefreq
     * @access private
     */
    private function _calculateChangefreq($targetUrl)
    {
        $changefreq = "Daily";

        if (!filter_var($targetUrl, FILTER_VALIDATE_URL)) {
            return $changefreq;
        }

        $changefreqList = $this->loadConfig("CHANGEFREQLIST");
        if (!is_array($changefreqList)) {
            return $changefreq;
        }

        foreach ($changefreqList as $changefreqKey => $changefreqValue) {
            if (stripos($targetUrl, (string) $changefreqKey) !== false) {
                $changefreq = $changefreqValue;
                break;
            }
        }

        return $changefreq;
    }

    /**
     * Turn an href value into an absolute URL.
     *
     * @param string $url       raw href value
     * @param string $originUrl URL of the page the href was found on
     * @access private
     * @return string absolute URL, or "" when the href is not crawlable
     */
    private function _formatUrl($url, $originUrl)
    {
        if (empty($url) || empty($originUrl)) {
            return "";
        }

        $badUrlItem = array("", "/", "javascript", "javascript:;");

        $formatUrl = trim($url);
        $formatUrl = trim($formatUrl, "#");
        $formatUrl = trim($formatUrl, "\"");
        $formatUrl = trim($formatUrl, "'");

        // Already absolute. Only a leading scheme counts; "http" elsewhere in
        // a relative path must not short-circuit the resolution below.
        if (preg_match('#^https?://#i', $formatUrl)) {
            return $formatUrl;
        }

        // Placeholders and non-HTTP schemes (mailto:, tel:, javascript:...)
        // are not crawlable.
        if (in_array($formatUrl, $badUrlItem, true) || preg_match('#^[a-z][a-z0-9+.-]*:#i', $formatUrl)) {
            return "";
        }

        if (strpos($formatUrl, "/") === 0) {
            // Root-relative link: resolve against the configured domain.
            $domainName = $this->loadConfig("DOMAIN_NAME");
            if (!is_string($domainName) || $domainName === "") {
                return "";
            }
            return rtrim($domainName, "/") . "/" . trim($formatUrl, "/");
        }

        // Document-relative link: resolve against the directory of the origin
        // URL. A slash belonging to "://" is not a path separator.
        $schemeEnd = strpos($originUrl, "://");
        $pathStart = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
        $lastSlash = strrpos($originUrl, "/");
        if ($lastSlash === false || $lastSlash < $pathStart) {
            $baseUrl = $originUrl;
        } else {
            $baseUrl = substr($originUrl, 0, $lastSlash);
        }

        return $baseUrl . "/" . $formatUrl;
    }

    /**
     * Check whether the url contains the configured DOMAIN_NAME.
     *
     * @param string $url
     * @return bool
     * @access private
     */
    private function _checkDomain($url)
    {
        if (!$url) {
            return false;
        }

        $domainName = $this->loadConfig("DOMAIN_NAME");
        if (!is_string($domainName) || $domainName === "") {
            return false;
        }

        return stripos($url, $domainName) !== false;
    }

    /**
     * Parse a response body into its same-domain links and page title.
     *
     * NOTE(review): the original anchor pattern and the opening/closing tags
     * of the title pattern were lost from the published listing; the patterns
     * below are reconstructions -- confirm against the original source.
     *
     * @param string $content   response body
     * @param string $originUrl URL the body was fetched from
     * @return array empty when either argument is empty, otherwise
     *               array("UrlItem" => array of absolute URLs, "Title" => string)
     * @access public
     */
    public function _parseContent($content, $originUrl)
    {
        $tagsList = array();

        if (empty($content) || empty($originUrl)) {
            return $tagsList;
        }

        /* get the href attribute of the anchor tags */
        $urlItem = array();
        $regStrForTagA = '#<a\s[^>]*?href\s*=\s*(["\'])(.*?)\1#is';
        if (preg_match_all($regStrForTagA, $content, $matches)) {
            $urlItem = array_unique($matches[2]);
            foreach ($urlItem as $urlKey => $url) {
                $decodedUrl = html_entity_decode($url, ENT_QUOTES, "UTF-8");
                $formatUrl = $this->_formatUrl($decodedUrl, $originUrl);
                if (empty($formatUrl)) {
                    unset($urlItem[$urlKey]);
                    continue;
                }

                if ($this->_checkDomain($formatUrl) === false) {
                    unset($urlItem[$urlKey]);
                    continue;
                }

                $urlItem[$urlKey] = $formatUrl;
            }
            // Different raw hrefs can resolve to the same absolute URL.
            $urlItem = array_values(array_unique($urlItem));
        }
        $tagsList["UrlItem"] = $urlItem;

        /* get the content of the title tag */
        $title = "";
        $regStrForTitle = '#<title[^>]*>(.*?)</title>#is';
        // Try in UTF-8 mode first; that fails on bodies that are not valid
        // UTF-8, so fall back to a byte-wise match.
        if (preg_match($regStrForTitle . "u", $content, $matches) || preg_match($regStrForTitle, $content, $matches)) {
            $title = html_entity_decode($matches[1], ENT_QUOTES, "UTF-8");
        }
        $tagsList["Title"] = $title;

        return $tagsList;
    }
}

/* here is a example */
$startTime = microtime(true);
echo "/***********************************************************************/", "\n";
echo "/* start to run {$startTime} */", "\n";
echo "/***********************************************************************/", "\n";

$siteMap = new SiteMap();
$domain = $siteMap->loadConfig("DOMAIN_NAME");
$siteMap->generateSiteMapList($domain);
$siteMapList = $siteMap->getSiteMapList();
$siteMap->generateSiteMapXml($siteMapList);

$endTime = microtime(true);
$takeTime = $endTime - $startTime;
echo "/***********************************************************************/", "\n";
echo "/* Had Done, it total take {$takeTime} */", "\n";
echo "/***********************************************************************/", "\n";
?>
4、配置文件代码
<?php
// NOTE(review): the beginning of this config file is missing from the
// published listing. The opening of $COOKIELIST below is restored from the
// keys the SiteMap class reads ("IsUseCookie", "CookiePath"). The class also
// calls loadConfig() with DOMAIN_NAME, CURLOPT_CONNECTTIMEOUT, CURLOPT_TIMEOUT
// and SKIP_URLLIST, so those variables must be defined here as well; their
// original values are not visible -- restore them from the source package.

// Cookie handling for the crawl requests.
$COOKIELIST = array(
    "IsUseCookie" => true,
    "CookiePath" => "/tmp/sitemapcookie"
);

// Path where the sitemap file is saved.
$SITEMAPPATH = "./sitemap.xml";

// Priority assigned by keyword found in the link.
$PRIORITYLIST = array(
    "product" => "0.8",
    "device" => "0.6",
    "intelligent" => "0.4",
    "course" => "0.2"
);

// Changefreq assigned by keyword found in the link.
$CHANGEFREQLIST = array(
    "product" => "Always",
    "device" => "Hourly",
    "intelligent" => "Daily",
    "course" => "Weekly",
    "login" => "Monthly",
    "about" => "Yearly"
);
?>
5、获取源码包
单击下载源代码 (提取码:fc1c)
