Revision 9 as of 2010-01-20 10:41:28

Clear message

# # Copyright (c) 2004, 2005 Google Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # # * Neither the name of Google nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # # The sitemap_gen.py script is written in Python 2.2 and released to # the open source community for continuous improvements under the BSD # 2.0 new license, which can be found at: # # http://www.opensource.org/licenses/bsd-license.php #

usage = \ """A simple script to automatically produce sitemaps for a webserver, in the Google Sitemap Protocol (GSP).

Usage: python sitemap_gen.py --config=config.xml [--help] [--testing]

"""

# Please be careful that all syntax used in this file can be parsed on # Python 1.5 -- this version check is not evaluated until after the # entire file has been parsed. import sys if sys.hexversion < 0x02020000:

import fnmatch import glob import gzip import md5 import os import re import stat import time import types import urllib import urlparse import xml.sax

# True and False were introduced in Python2.2.2 try:

except NameError:

# Text encodings ENC_ASCII = 'ASCII' ENC_UTF8 = 'UTF-8' ENC_IDNA = 'IDNA' ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US'

ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5']

# Maximum number of urls in each sitemap, before next Sitemap is created MAXURLS_PER_SITEMAP = 50000

# Suffix on a Sitemap index file SITEINDEX_SUFFIX = '_index.xml'

# Regular expressions tried for extracting URLs from access logs. ACCESSLOG_CLF_PATTERN = re.compile(

# Match patterns for lastmod attributes LASTMOD_PATTERNS = map(re.compile, [

# Match patterns for changefreq attributes CHANGEFREQ_PATTERNS = [

# XML formats SITEINDEX_HEADER = \

SITEINDEX_FOOTER = '</sitemapindex>\n' SITEINDEX_ENTRY = \

SITEMAP_HEADER = \

SITEMAP_FOOTER = '</urlset>\n' SITEURL_XML_PREFIX = ' <url>\n' SITEURL_XML_SUFFIX = ' </url>\n'

# Search engines to notify with the updated sitemaps # # This list is very non-obvious in what's going on. Here's the gist: # Each item in the list is a 6-tuple of items. The first 5 are "almost" # the same as the input arguments to urlparse.urlunsplit(): # 0 - scheme # 1 - netloc # 2 - path # 3 - query <-- EXCEPTION: specify a query map rather than a string # 4 - fragment # Additionally, add item 5: # 5 - query attribute that should be set to the new Sitemap URL # Clear as mud, I know. NOTIFICATION_SITES = [

class Error(Exception):

#end class Error

class SchemaError(Error):

#end class SchemaError

class Encoder:

#end class Encoder encoder = Encoder()

class Output:

#end class Output output = Output()

class URL(object):

#end class URL

class Filter:

#end class Filter

class InputURL:

#end class InputURL

class InputURLList:

#end class InputURLList

class InputDirectory:

#end class InputDirectory

class InputAccessLog:

#end class InputAccessLog

class InputSitemap(xml.sax.handler.ContentHandler):

#end class InputSitemap

class FilePathGenerator:

#end class FilePathGenerator

class PerURLStatistics:

class Sitemap(xml.sax.handler.ContentHandler):

#end class Sitemap

def ValidateAttributes(tag, attributes, goodattributes):

#end def ValidateAttributes

def ExpandPathAttribute(src, attrib):

#end def ExpandPathAttribute

def OpenFileForRead(path, logtext):

#end def OpenFileForRead

def TimestampISO8601(t):

#end def TimestampISO8601

def CreateSitemapFromFile(configpath, suppress_notify):

#end def CreateSitemapFromFile

def ProcessCommandFlags(args):

#end def ProcessCommandFlags

# # main #

if __name__ == '__main__':

Unable to edit the page? See the FrontPage for instructions.