Attachment 'SimpleHTMLParser.py'

Download

   1 # SimpleHTMLParser
   2 # This is a very simple parser intended to extract the body text, title, and
   3 # a list of URLs. It is intended to be as lenient as possible, regarding
   4 # HTML standards and poorly formed HTML.
   5 #
   6 # See http://www.winwaed.com/info/python_html/parser.shtml for more details.
   7 #
   8 # Copyright and License Notice
   9 # ----------------------------
  10 # Copyright (C) 2006 by Winwaed Software Technology.  All rights reserved.  
  11 # Some individual files may be covered by other copyrights.
  12 #
  13 # This material was originally written and compiled by Richard Marsden 
  14 # of Winwaed Software Technology 2004-6.
  15 #
  16 # Redistribution and use in source and binary forms, with or without
  17 # modification, are permitted provided that this entire copyright notice
  18 # is duplicated in all such copies.
  19 # 
  20 # This software is provided "as is" and without any expressed or implied
  21 # warranties, including, without limitation, the implied warranties of
  22 # merchantibility and fitness for any particular purpose.
  23 #
  24 # Winwaed Software Technology
  25 # http://www.winwaed.com
  26 
  27 from string import join, split
  28 from urlparse import urljoin
  29 
  30 import htmlentitydefs
  31 import re
  32 
  33 
  34 
  35 # ############################
  36 # HTML Parsing Objects, etc.
  37 
  38 def simplify_text(sbuff):
  39     str = ""
  40     already_sp = 1
  41     for c in sbuff:
  42         if c.isspace():
  43             if not already_sp:
  44                 already_sp = 1
  45                 str = str + " "
  46         else:
  47             str = str + c
  48             already_sp = 0
  49     return str.strip();
  50 
  51 ###################
  52 # This is a basic parsing class. It should be sub-classed, with new
  53 # methods that override the handler methods. See SimpleTextHTMLParser
  54 # for an example.
  55 
  56 class BaseSimpleHTMLParser:
  57     def __init__(self):
  58         self.intext="";
  59     def handle_textData(self, text):
  60         pass
  61     def handle_endDocument(self):
  62         pass
  63     def handle_startDocument(self):
  64         pass
  65     def handle_startTag(self, tag, attrs):
  66         pass
  67     def handle_startEndTag(self, tag, attrs):
  68         pass
  69     def handle_endTag(self, tag):
  70         pass
  71     def handle_entityTag(self, tag):
  72         # Note: this might be called with "entity candidates" - ie. poorly written
  73         # entities, or "&" characters that are not properly escaped
  74         # The inheriting object should try to interpret the entity. If this fails,
  75         # it should be interpreted as text data
  76         pass
  77 
  78     def parse(self,sInput):
  79         self.intext=sInput;
  80         self.iptr = 0;
  81         sData = ""
  82         self.handle_startDocument();
  83         while (self.iptr<len(self.intext) ):
  84             ch = self.intext[self.iptr];
  85             if (ch=='<'):
  86                 # output and empty the data buffer
  87                 if (len(sData)>0):
  88                     self.handle_textData(sData)
  89                     sData = ""
  90                 # get the new tag and interpret
  91                 tag = self.fetchTagToClosedAngle();
  92                 if tag[0] == '?':
  93                     # DOCTYPE or similar - do nothing
  94                     pass
  95                 elif tag[0]=='!':
  96                     if (tag[1]=='-' and tag[2]=='-'):
  97                         # comment - do nothing
  98                         pass
  99                     else:
 100                         # DOCTYPE or similar - do nothing 
 101                         pass
 102                 elif (tag[0]=='/'):
 103                     # closed tag
 104                     endTag = tag[1:].strip().lower();
 105                     self.handle_endTag(endTag);
 106                 else:
 107                     # tag
 108                     self.parseTag(tag)
 109             elif (ch=='&'):
 110                 # output and empty the data buffer
 111                 if (len(sData)>0):
 112                     self.handle_textData(sData)
 113                     sData = ""
 114                 # Fetch the Entity Tag
 115                 tag = self.fetchEntityTag();
 116                 if (len(tag)>0):
 117                     self.handle_entityTag(tag)
 118             else:
 119                 sData = sData + ch;
 120             self.iptr=self.iptr+1;
 121         # End of document
 122         self.handle_endDocument();        
 123 
 124     def fetchTagToClosedAngle(self):
 125         ch = '/'
 126         st = ""
 127         while (ch!='>' and self.iptr<len(self.intext) ):
 128             self.iptr = self.iptr + 1
 129             ch = self.intext[self.iptr]
 130             if (ch!='>'):
 131                 st = st + ch
 132         return st;
 133 
 134     def fetchEntityTag(self):
 135         ch = 'A'   # dummy char
 136         st = ""
 137         while (ch.isalnum() and self.iptr<len(self.intext) ):
 138             self.iptr = self.iptr + 1
 139             ch = self.intext[self.iptr]
 140             if (ch.isalnum()):
 141                 st = st + ch
 142         return st;
 143     
 144     def skipToEndScriptTag(self):
 145         kptr=0
 146         escript = "</script>"
 147         while ( kptr<9 and self.iptr<len(self.intext)-1 ):
 148             self.iptr = self.iptr + 1
 149             if self.intext[self.iptr].lower() == escript[kptr]:
 150                 kptr = kptr + 1
 151             else:
 152                 kptr = 0
 153 
 154     def parseTag(self,tag):
 155         sbuff = tag.strip()
 156         bStartEnd=0
 157         attribs = {}
 158         if (sbuff[len(sbuff)-1]=='/'):
 159             bStartEnd=1
 160             sbuff = sbuff[:len(sbuff)]
 161 
 162         tmatch = re.compile(r'\w+')
 163         tm = tmatch.search(sbuff)
 164         ipp = 0
 165         if (tm):
 166         	output_tag = tm.group().lower()
 167         	sbuff = sbuff[ tm.end() : ]
 168         	attr_match = re.compile(r'(\w+)\s*=\s*"([^"]*)"')
 169         	m = attr_match.findall( sbuff)
 170         	if (m):
 171         		# extracted attributes=>create dictionary
 172         		for aa in m:
 173         			attribs[ aa[0] ] = aa[1]
 174         else:
 175         	output_tag = sbuff.lower()
 176         if (bStartEnd):
 177             self.handle_startEndTag(output_tag, attribs )            
 178         else:
 179             if (output_tag=="script"):
 180                 # Skip the script tag and its contents
 181                 self.skipToEndScriptTag()
 182             else:
 183                 self.handle_startTag(output_tag, attribs )            
 184 
 185 
 186 ####################
 187 # This is the main parser, which implements some of the handlers
 188 
 189 class SimpleTextHTMLParser(BaseSimpleHTMLParser):
 190     def __init__(self):
 191         self.mytext="";
 192         self.bodyText="";
 193         self.titleText="";
 194         self.bInTitle = 0;
 195         self.listURLs = [];
 196         self.fullUniqueURLs = {}
 197         self.sFullURL=""
 198     def parse(self,sInput,sURL):
 199         self.sFullURL = sURL
 200         BaseSimpleHTMLParser.parse(self,sInput)
 201     def handle_textData(self, text):
 202         self.mytext = self.mytext + text;
 203     def handle_entityTag(self, tag):
 204         # Note: Unicode is passed through as an entity code
 205         try:
 206             if (tag[0]=='#'):
 207                 # entity ASCII code
 208                 chval = 0
 209                 if (tag[1]=='x'):
 210                     chval = int(tag[2:],16)
 211                 else:
 212                     chval = int(tag[1:],10)
 213                 if chval>255:
 214                     self.mytext = self.mytext + "&" + tag + ";"
 215                 else:
 216                     self.mytext = self.mytext + chr(chval)
 217             else:   # entity symbol            
 218                 self.mytext = self.mytext + htmlentitydefs.entitydefs[tag];
 219         except:
 220             # The above will fail if the entity is actually a poorly escaped '&'
 221             # If this is the case, record it as text data
 222             self.mytext = self.mytext + "&" + tag
 223             if (self.iptr<len(self.intext)):
 224                 self.mytext = self.mytext + self.intext[self.iptr]
 225             
 226     def handle_startDocument(self):
 227         self.mytext="";
 228     def handle_endDocument(self):
 229         self.bodyText = simplify_text(self.mytext)
 230         self.mytext=""
 231         self.listURLs = self.fullUniqueURLs.keys()
 232     def handle_startTag(self, tag, attrs):
 233         if tag=="title":
 234             self.bInTitle = 1
 235             self.mytext=""
 236         elif tag=="a":
 237             if attrs.has_key("href"):
 238                 sub_url = attrs["href"]
 239                 url = split( urljoin(self.sFullURL,sub_url), '#')[0] 
 240                 if (url[len(url)-1]=='/'):
 241                     url = url[ : len(url)-1]
 242                 if url[:5] == "http:":
 243                     self.fullUniqueURLs[url] = 1
 244         elif tag=="body":
 245             if (self.bInTitle):
 246                 self.bInTitle = 0
 247                 self.titleText=simplify_text(self.mytext);
 248             self.myText = ""
 249     def handle_startEndTag(self, tag, attrs):
 250         pass    
 251     def handle_endTag(self, tag):
 252         if (tag=="title"):
 253             self.titleText = simplify_text(self.mytext);
 254             self.myText = ""
 255             self.bInTitle = 0
 256         elif (tag=="head"):
 257             if (self.bInTitle):
 258                 self.bInTitle = 0
 259                 self.titleText=simplify_text(self.mytext);
 260             self.myText = ""

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2007-01-15 10:35:03, 8.8 KB) [[attachment:SimpleHTMLParser.py]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.

Unable to edit the page? See the FrontPage for instructions.