Attachment 'SimpleHTMLParser.py'
Download 1 # SimpleHTMLParser
2 # This is a very simple parser intended to extract the body text, title, and
3 # a list of URLs. It is intended to be as lenient as possible, regarding
4 # HTML standards and poorly formed HTML.
5 #
6 # See http://www.winwaed.com/info/python_html/parser.shtml for more details.
7 #
8 # Copyright and License Notice
9 # ----------------------------
10 # Copyright (C) 2006 by Winwaed Software Technology. All rights reserved.
11 # Some individual files may be covered by other copyrights.
12 #
13 # This material was originally written and compiled by Richard Marsden
14 # of Winwaed Software Technology 2004-6.
15 #
16 # Redistribution and use in source and binary forms, with or without
17 # modification, are permitted provided that this entire copyright notice
18 # is duplicated in all such copies.
19 #
20 # This software is provided "as is" and without any expressed or implied
21 # warranties, including, without limitation, the implied warranties of
22 # merchantibility and fitness for any particular purpose.
23 #
24 # Winwaed Software Technology
25 # http://www.winwaed.com
26
27 from string import join, split
28 from urlparse import urljoin
29
30 import htmlentitydefs
31 import re
32
33
34
35 # ############################
36 # HTML Parsing Objects, etc.
37
def simplify_text(sbuff):
    """Normalise the whitespace in *sbuff*.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed to a single space, and leading/trailing whitespace is removed.

    sbuff   -- the string to normalise.
    Returns the normalised string ("" when sbuff is empty or all whitespace).
    """
    # split() with no argument splits on runs of whitespace and discards
    # leading/trailing runs, which is exactly the collapse-and-strip wanted
    # here.  It also avoids building the result one character at a time.
    return " ".join(sbuff.split())
50
51 ###################
52 # This is a basic parsing class. It should be sub-classed, with the new
53 # class overriding the handler methods. See SimpleTextHTMLParser (below)
54 # for an example.
55
class BaseSimpleHTMLParser:
    """Minimal, deliberately lenient, event-driven HTML scanner.

    parse() walks the input once and reports what it finds through the
    handle_* methods.  All handlers are no-ops here; sub-classes override
    the ones they are interested in.

    Attributes:
    intext -- the text currently being parsed.
    iptr   -- index into intext of the character being examined.
    """

    # Compiled once for the class rather than once per tag.
    _TAG_NAME_RE = re.compile(r'\w+')
    # name = "value" | 'value' | bare-value
    _ATTR_RE = re.compile(
        r'''([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))''')
    _END_SCRIPT_RE = re.compile(r'</script>', re.IGNORECASE)

    def __init__(self):
        self.intext = ""
        self.iptr = 0

    def handle_textData(self, text):
        """Called with each run of plain text found between tags/entities."""
        pass

    def handle_endDocument(self):
        """Called once, after the whole input has been scanned."""
        pass

    def handle_startDocument(self):
        """Called once, before scanning starts."""
        pass

    def handle_startTag(self, tag, attrs):
        """Called for an opening tag.

        tag   -- lower-cased tag name.
        attrs -- dictionary of attribute name (lower-cased) -> value.
        """
        pass

    def handle_startEndTag(self, tag, attrs):
        """Called for a self-closing tag such as <br/>; arguments as for
        handle_startTag."""
        pass

    def handle_endTag(self, tag):
        """Called for a closing tag; tag is the lower-cased tag name."""
        pass

    def handle_entityTag(self, tag):
        """Called with the text that follows an '&' (without the '&').

        Note: this might be called with "entity candidates" - ie. poorly
        written entities, or "&" characters that are not properly escaped.
        The inheriting object should try to interpret the entity.  If this
        fails, it should be interpreted as text data.

        While this handler runs, self.iptr is the index of the character that
        ended the entity name (normally ';'), or len(self.intext) if the
        input ended first.  parse() skips that character afterwards, so a
        handler that wants to keep it must copy it itself.
        """
        pass

    def parse(self, sInput):
        """Scan sInput and fire the handle_* callbacks in document order."""
        self.intext = sInput
        self.iptr = 0
        chunks = []     # pending plain text, joined when flushed
        self.handle_startDocument()
        while self.iptr < len(self.intext):
            ch = self.intext[self.iptr]
            if ch == '<':
                # output and empty the data buffer
                if chunks:
                    self.handle_textData("".join(chunks))
                    chunks = []
                if self.intext.startswith("<!--", self.iptr):
                    self._skipComment()
                else:
                    self._dispatchTag(self.fetchTagToClosedAngle())
            elif ch == '&':
                # output and empty the data buffer
                if chunks:
                    self.handle_textData("".join(chunks))
                    chunks = []
                tag = self.fetchEntityTag()
                if tag:
                    self.handle_entityTag(tag)
                else:
                    # A bare '&' is ordinary text.  iptr already points at
                    # the following character, which has not been looked at
                    # yet, so loop again without stepping past it.
                    chunks.append("&")
                    continue
            else:
                chunks.append(ch)
            self.iptr = self.iptr + 1
        # Text after the last tag must not be lost.
        if chunks:
            self.handle_textData("".join(chunks))
        self.handle_endDocument()

    def _dispatchTag(self, tag):
        """Route the text found between '<' and '>' to the right handler."""
        if not tag:
            # "<>" or a '<' at the very end of the input: nothing to report.
            return
        first = tag[0]
        if first == '?' or first == '!':
            # Processing instruction, DOCTYPE or similar - do nothing
            return
        if first == '/':
            self.handle_endTag(tag[1:].strip().lower())
        else:
            self.parseTag(tag)

    def _skipComment(self):
        """Move iptr to the '>' that closes the comment starting at iptr.

        A comment may itself contain '>' (eg. commented-out markup), so the
        search is for the full "-->" terminator.  If there is none, fall back
        to stopping at the first '>' rather than discarding the rest of the
        document.
        """
        end = self.intext.find("-->", self.iptr + 4)
        if end < 0:
            self.fetchTagToClosedAngle()
        else:
            self.iptr = end + 2

    def fetchTagToClosedAngle(self):
        """Return the text between the '<' at iptr and the next '>'.

        On return iptr is the index of that '>', or len(self.intext) if the
        tag is never closed (the remaining text is then returned).
        """
        start = self.iptr + 1
        end = self.intext.find('>', start)
        if end < 0:
            end = len(self.intext)
        self.iptr = end
        return self.intext[start:end]

    def fetchEntityTag(self):
        """Return the entity name that follows the '&' at iptr.

        The name is an optional leading '#' (numeric entity) followed by
        alphanumeric characters; it may be empty.  On return iptr is the index
        of the first character after the name, or len(self.intext) if the
        input ended.
        """
        text = self.intext
        limit = len(text)
        start = self.iptr + 1
        pos = start
        if pos < limit and text[pos] == '#':
            pos = pos + 1
        while pos < limit and text[pos].isalnum():
            pos = pos + 1
        self.iptr = pos
        return text[start:pos]

    def skipToEndScriptTag(self):
        """Move iptr to the final '>' of the next "</script>" (any case).

        If there is no closing tag, iptr is moved to the last character of
        the input so that the rest of the document is skipped.
        """
        match = self._END_SCRIPT_RE.search(self.intext, self.iptr + 1)
        if match:
            self.iptr = match.end() - 1
        elif self.iptr < len(self.intext) - 1:
            self.iptr = len(self.intext) - 1

    def parseTag(self, tag):
        """Parse the inside of an opening tag and fire the matching handler.

        tag -- the text between '<' and '>', eg. 'a href="x"' or 'br/'.

        Calls handle_startEndTag for self-closing tags, skips the contents of
        <script> elements, and calls handle_startTag for everything else.
        """
        sbuff = tag.strip()
        if not sbuff:
            return
        bStartEnd = 0
        if sbuff.endswith('/'):
            bStartEnd = 1
            sbuff = sbuff[:-1]

        attribs = {}
        tm = self._TAG_NAME_RE.search(sbuff)
        if tm:
            output_tag = tm.group().lower()
            for name, dquoted, squoted, bare in self._ATTR_RE.findall(
                    sbuff[tm.end():]):
                # Exactly one of the three value groups can be non-empty.
                # Names are lower-cased, like tag names, because HTML
                # attribute names are case-insensitive.
                attribs[name.lower()] = dquoted or squoted or bare
        else:
            output_tag = sbuff.lower()

        if bStartEnd:
            self.handle_startEndTag(output_tag, attribs)
        elif output_tag == "script":
            # Skip the script tag and its contents
            self.skipToEndScriptTag()
        else:
            self.handle_startTag(output_tag, attribs)
184
185
186 ####################
187 # This is the main parser, which implements some of the handlers
188
class SimpleTextHTMLParser(BaseSimpleHTMLParser):
    """Parser that extracts the title, the body text and the linked URLs.

    After parse() returns, the results are available as:
    titleText -- whitespace-normalised contents of <title>.
    bodyText  -- whitespace-normalised text collected after the title.
    listURLs  -- the distinct absolute "http:" URLs found in <a href=...>
                 (fragment and trailing '/' removed; other schemes, including
                 https:, are ignored).

    Note that fullUniqueURLs is only reset by __init__, so URLs accumulate if
    the same instance parses several documents.
    """

    def __init__(self):
        BaseSimpleHTMLParser.__init__(self)
        self.mytext = ""            # text collected since the last reset
        self.bodyText = ""
        self.titleText = ""
        self.bInTitle = 0           # true between <title> and its end
        self.listURLs = []
        self.fullUniqueURLs = {}    # used as a set: URL -> 1
        self.sFullURL = ""          # base URL for resolving relative links

    def parse(self, sInput, sURL):
        """Parse the HTML text sInput; sURL is the URL it was fetched from
        and is used to resolve relative links."""
        self.sFullURL = sURL
        BaseSimpleHTMLParser.parse(self, sInput)

    def handle_textData(self, text):
        self.mytext = self.mytext + text

    def handle_entityTag(self, tag):
        """Append the character for entity `tag` to the collected text.

        Note: Unicode (code points above 255) is passed through as an entity
        code.  Anything that cannot be decoded is assumed to be a poorly
        escaped '&' and is recorded as literal text.
        """
        # self.iptr indexes the character that ended the entity name.  The
        # base parser steps over it once this handler returns, so it has to
        # be copied here unless it is the ';' of a well-formed entity.
        terminator = ""
        if self.iptr < len(self.intext):
            terminator = self.intext[self.iptr]
        try:
            text = self._decodeEntity(tag)
            if terminator == ";":
                terminator = ""
        except (IndexError, KeyError, ValueError):
            # IndexError: tag too short (eg. "#"); KeyError: unknown entity
            # name; ValueError: malformed number or undecodable character.
            text = "&" + tag
        self.mytext = self.mytext + text + terminator

    def _decodeEntity(self, tag):
        """Return the replacement text for entity name `tag`.

        Raises IndexError, KeyError or ValueError if it cannot be decoded.
        """
        if tag[0] == '#':
            # numeric character reference, decimal or hexadecimal
            if tag[1] in ('x', 'X'):
                chval = int(tag[2:], 16)
            else:
                chval = int(tag[1:], 10)
            if chval > 255:
                return "&" + tag + ";"
            return chr(chval)
        # entity symbol
        return htmlentitydefs.entitydefs[tag]

    def handle_startDocument(self):
        self.mytext = ""

    def handle_endDocument(self):
        self.bodyText = simplify_text(self.mytext)
        self.mytext = ""
        self.listURLs = list(self.fullUniqueURLs.keys())

    def handle_startTag(self, tag, attrs):
        if tag == "title":
            self.bInTitle = 1
            self.mytext = ""
        elif tag == "a":
            if "href" in attrs:
                # Resolve against the page URL and drop any #fragment.
                url = urljoin(self.sFullURL, attrs["href"]).split('#')[0]
                if url.endswith('/'):
                    url = url[:-1]
                if url[:5] == "http:":
                    self.fullUniqueURLs[url] = 1
        elif tag == "body":
            # <body> reached with the title still open: close it here.
            if self.bInTitle:
                self._finishTitle()

    def handle_startEndTag(self, tag, attrs):
        pass

    def handle_endTag(self, tag):
        if tag == "title":
            self._finishTitle()
        elif tag == "head":
            # </head> reached with the title still open: close it here.
            if self.bInTitle:
                self._finishTitle()

    def _finishTitle(self):
        """Store the collected text as the title and start collecting afresh,
        so that the title is not repeated in the body text."""
        self.titleText = simplify_text(self.mytext)
        self.mytext = ""
        self.bInTitle = 0
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.