文章摘要
这篇文章介绍了如何使用Python语言和Pymysql库,从数据库中提取全部URL信息并生成sitemap文件;当URL数量超过1万条时,脚本会每1万条自动生成一个新的sitemap文件,并生成对应的sitemap索引文件。文章详细描述了如何连接数据库、提取URL数据、处理数据并分批生成sitemap文件,确保数据量大的情况下能够高效处理。核心内容是通过自动化脚本实现sitemap文件的生成和管理。
#coding:utf-8
#python生成sitemap,超过1万条数据自动生成新文件。
#from __future__ import division
#
import os,datetime
import sys
import pymysql.cursors
# Python 2 only: make UTF-8 the implicit str/unicode conversion codec so that
# byte strings containing non-ASCII text can be mixed with unicode values when
# formatting (presumably what the database driver returns -- confirm).
# Python 3 has neither reload() as a builtin nor sys.setdefaultencoding().
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Public URL prefix of the site; every exported URL starts with it.
hosts = '域名/'
# Filesystem directory that receives the generated sitemap files.
path = '/data/wwwroot/forwei/www/sitemaps/'
# The same directory relative to the site root, used to build <loc> URLs.
paths = 'sitemaps/'
# Date stamped into every <lastmod> element.
lastmod = datetime.date.today()

# Create the output directory directly instead of shelling out to `mkdir`:
# no shell is involved, missing parents are created, and real failures
# (e.g. permissions) surface here rather than on a later open().
if not os.path.isdir(path):
    os.makedirs(path)

connection = pymysql.connect(host="127.0.0.1",user="用户名",password="密码",db="表名")
# Collect class paths, topic paths and article URLs in a single result set.
sql = 'SELECT classpath FROM phome_enewsclass union select ztpath from phome_enewszt union SELECT titleurl FROM phome_ecms_news'
try:
    # The cursor context manager closes the cursor on exit, so no explicit
    # cursor.close() is needed.
    with connection.cursor() as cursor:
        cursor.execute(sql)
        cnm = cursor.fetchall()
finally:
    connection.close()

# Write one absolute URL per line; the `with` block guarantees the file is
# closed even if a row cannot be written.
# A mobile-site URL list (mobile_url.txt) can be produced the same way with a
# different host prefix.
with open('urls.txt', "w") as pan:
    for row in cnm:
        for item in row:
            # Skip NULL columns and values too short to be a real path.
            if not item or len(item) <= 2:
                continue
            # Drop a single leading slash so it is not doubled after `hosts`.
            relative = item[1:] if item[0] == "/" else item
            pan.write("%s%s\n" % (hosts, relative))
def add_file(j,f1,hosts,paths):
    """Register sitemap number ``j`` in the index and open it for writing.

    Appends a ``<sitemap>`` entry for ``sitemap_<j>.xml`` to the already-open
    index file ``f1`` (its URL is ``hosts + paths + file name``), then creates
    the sitemap file inside the module-level ``path`` directory and writes the
    XML declaration and opening ``<urlset>`` tag.

    Returns the open file object; the caller is responsible for writing the
    closing ``</urlset>`` tag and closing the file. The ``<lastmod>`` value
    comes from the module-level ``lastmod``.
    """
    sitemap_name = 'sitemap_%s.xml' % (j)

    # Index entry first, so the index lists files in creation order.
    index_entry = "\n<sitemap>\n<loc>%s%s%s</loc>\n<lastmod>%s</lastmod>\n<priority>0.8</priority>\n</sitemap>" % (
        hosts, paths, sitemap_name, lastmod)
    f1.write(index_entry)

    target = "%s%s" % (path, sitemap_name)
    sitemap_file = open(target, "w")
    sitemap_file.write('<?xml version="1.0" encoding="utf-8"?>\n<urlset>')
    return sitemap_file
# Count the non-blank URLs collected in urls.txt.
c = 0
with open('urls.txt') as url_list:
    for line in url_list:
        if line.strip():
            c += 1
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(c)

# Work out how many sitemap files are needed at 10,000 URLs per file:
# the full batches, plus one more file for any remainder.
file_num = c // 10000
if c % 10000 != 0:
    file_num += 1
print('总共有%s条URL,生成%s个sitemap文件' % (c, file_num))
# Split urls.txt into sitemap files of at most 10,000 URLs each, named
# sitemap_1.xml, sitemap_2.xml, ..., and list every file in sitemapindex.xml.
from xml.sax.saxutils import escape

i = 0  # number of URLs already written to the current sitemap file
j = 2  # numeric suffix of the next sitemap file to create

f1 = open(os.path.join(path, 'sitemapindex.xml'), 'w')
try:
    f1.write('<?xml version="1.0" encoding="utf-8"?>\n<sitemapindex>')
    f1.write("\n<sitemap>\n<loc>%s%s%s</loc>\n<lastmod>%s</lastmod>\n<priority>0.8</priority>\n</sitemap>"%(hosts,paths,'sitemap_1.xml',lastmod))

    f = open(os.path.join(path, 'sitemap_1.xml'), 'w')
    try:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n<urlset>')
        with open("urls.txt") as url_list:
            for line in url_list:
                url = line.strip()
                # Blank lines are not URLs; skipping them keeps empty <loc>
                # elements out of the output.
                if not url:
                    continue
                # Roll over only once the current file is full, so every file
                # (including the first) holds exactly 10,000 URLs and a new
                # file is never started without a URL to put in it.
                if i == 10000:
                    f.write('\n</urlset>')
                    f.close()
                    f = add_file(j,f1,hosts,paths)
                    j += 1
                    i = 0
                # Characters such as '&' must be entity-escaped inside XML.
                f.write("\n<url>\n<loc>%s</loc>\n<lastmod>%s</lastmod>\n<priority>0.8</priority>\n</url>"%(escape(url),lastmod))
                i += 1
        f.write('\n</urlset>')
    finally:
        # Closes whichever sitemap file is current, including the last one.
        f.close()
    f1.write('\n</sitemapindex>')
finally:
    f1.close()

Villain博客
