This file is indexed.

/usr/share/doc/diveintopython-zh/html/html_processing/index.html is in diveintopython-zh 5.4b-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
<!DOCTYPE html
  PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
   <head>
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
   
      <title>&nbsp;8&nbsp;&nbsp;HTML 处理</title>
      <link rel="stylesheet" href="../diveintopython.css" type="text/css">
      <link rev="made" href="mailto:f8dy@diveintopython.org">
      <meta name="generator" content="DocBook XSL Stylesheets V1.52.2">
      <meta name="keywords" content="Python, Dive Into Python, tutorial, object-oriented, programming, documentation, book, free">
      <meta name="description" content="Python from novice to pro">
      <link rel="home" href="../toc/index.html" title="Dive Into Python">
      <link rel="up" href="../toc/index.html" title="Dive Into Python">
      <link rel="previous" href="../regular_expressions/summary.html" title="7.7.&nbsp;小结">
      <link rel="next" href="introducing_sgmllib.html" title="8.2.&nbsp;sgmllib.py 介绍">
   </head>
   <body>
      <table id="Header" width="100%" border="0" cellpadding="0" cellspacing="0" summary="">
         <tr>
            <td id="breadcrumb" colspan="5" align="left" valign="top">导航:<a href="../index.html">起始页</a>&nbsp;&gt;&nbsp;<a href="../toc/index.html">Dive Into Python</a>&nbsp;&gt;&nbsp;<span class="thispage">HTML 处理</span></td>
            <td id="navigation" align="right" valign="top">&nbsp;&nbsp;&nbsp;<a href="../regular_expressions/summary.html" title="上一页: “小结”">&lt;&lt;</a>&nbsp;&nbsp;&nbsp;<a href="introducing_sgmllib.html" title="下一页: “sgmllib.py 介绍”">&gt;&gt;</a></td>
         </tr>
         <tr>
            <td colspan="3" id="logocontainer">
               <h1 id="logo"><a href="../index.html" accesskey="1">深入 Python :Dive Into Python 中文版</a></h1>
               <p id="tagline">Python 从新手到专家 [Dip_5.4b_CPyUG_Release]</p>
            </td>
            <td colspan="3" align="right">
               <form id="search" method="GET" action="http://www.google.com/custom">
                  <p><label for="q" accesskey="4">Find:&nbsp;</label><input type="text" id="q" name="q" size="20" maxlength="255" value=""> <input type="submit" value="搜索"><input type="hidden" name="domains" value="woodpecker.org.cn"><input type="hidden" name="sitesearch" value="www.woodpecker.org.cn/diveintopython"></p>
               </form>
            </td>
         </tr>
      </table>
      <!--#include virtual="/inc/ads" -->
      <div class="chapter" lang="zh-CN">
         <div class="titlepage">
            <div>
               <div>
                  <h2 class="title"><a name="dialect"></a>&nbsp;8&nbsp;&nbsp;<span class="acronym">HTML</span> 处理
                  </h2>
               </div>
            </div>
            <div></div>
         </div>
         <div class="toc">
            <ul>
               <li><span class="section"><a href="index.html#dialect.divein">8.1. 概览</a></span></li>
               <li><span class="section"><a href="introducing_sgmllib.html">8.2. sgmllib.py 介绍</a></span></li>
               <li><span class="section"><a href="extracting_data.html">8.3. 从 HTML 文档中提取数据</a></span></li>
               <li><span class="section"><a href="basehtmlprocessor.html">8.4. BaseHTMLProcessor.py 介绍</a></span></li>
               <li><span class="section"><a href="locals_and_globals.html">8.5. locals 和 globals</a></span></li>
               <li><span class="section"><a href="dictionary_based_string_formatting.html">8.6. 基于 dictionary 的字符串格式化</a></span></li>
               <li><span class="section"><a href="quoting_attribute_values.html">8.7. 给属性值加引号</a></span></li>
               <li><span class="section"><a href="dialect.html">8.8. dialect.py 介绍</a></span></li>
               <li><span class="section"><a href="all_together.html">8.9. 全部放在一起</a></span></li>
               <li><span class="section"><a href="summary.html">8.10. 小结</a></span></li>
            </ul>
         </div>
         <div class="section" lang="zh-CN">
            <div class="titlepage">
               <div>
                  <div>
                     <h2 class="title"><a name="dialect.divein"></a>8.1.&nbsp;概览
                     </h2>
                  </div>
               </div>
               <div></div>
            </div>
            <div class="abstract">
               <p>
                  我经常在 <a href="http://groups.google.com/groups?group=comp.lang.python">comp.lang.python</a> 上看到如下的问题: “<span class="quote">怎么才能从我的 <span class="acronym">HTML</span> 文档中列出所有的 [标题|图像|链接] 呢?</span>” “<span class="quote">怎么才能 [分析|解释|munge] 我的 <span class="acronym">HTML</span> 文档的文本,但是又要保留标记呢?</span>” “<span class="quote">怎么才能一次给我所有的 <span class="acronym">HTML</span> 标记 [增加|删除|加引号] 属性呢?</span>” 本章将回答所有这些问题。
               </p>
            </div>
            <p>下面给出一个完整的,可工作的 <span class="application">Python</span> 程序,它分为两部分。第一部分,<tt class="filename">BaseHTMLProcessor.py</tt> 是一个通用工具,它可以通过扫描标记和文本块来帮助您处理 <span class="acronym">HTML</span> 文件。第二部分,<tt class="filename">dialect.py</tt> 是一个例子,演示了如何使用 <tt class="filename">BaseHTMLProcessor.py</tt> 来转换 <span class="acronym">HTML</span> 文档中的文本,同时保留标记不变。阅读文档字符串 (<tt class="literal">doc string</tt>) 和注释来了解将要发生事情的概况。大部分内容看上去像巫术,因为这些类的方法究竟是如何被调用的并不明显。不要紧,所有内容都会按进度逐步展示出来。
            </p>
            <div class="example"><a name="dialect.basehtml.listing"></a><h3 class="title">&nbsp;8.1.&nbsp;<tt class="filename">BaseHTMLProcessor.py</tt></h3>
               <p>如果您还没有下载本书附带的样例程序, 可以 <a href="http://www.woodpecker.org.cn/diveintopython/download/diveintopython-exampleszh-cn-5.4b.zip" title="Download example scripts">下载本程序和其他样例程序</a></p><pre class="programlisting"><span class='pykeyword'>
from</span> sgmllib <span class='pykeyword'>import</span> SGMLParser
<span class='pykeyword'>import</span> htmlentitydefs

<span class='pykeyword'>class</span><span class='pyclass'> BaseHTMLProcessor</span>(SGMLParser):
    <span class='pykeyword'>def</span><span class='pyclass'> reset</span>(self):                       
        <span class='pycomment'># extend (called by SGMLParser.__init__)</span>
        self.pieces = []
        SGMLParser.reset(self)

    <span class='pykeyword'>def</span><span class='pyclass'> unknown_starttag</span>(self, tag, attrs):
        <span class='pycomment'># called for each start tag</span>
        <span class='pycomment'># attrs is a list of (attr, value) tuples</span>
        <span class='pycomment'># e.g. for &lt;pre class="screen"&gt;, tag="pre", attrs=[("class", "screen")]</span>
        <span class='pycomment'># Ideally we would like to reconstruct original tag and attributes, but</span>
        <span class='pycomment'># we may end up quoting attribute values that weren't quoted in the source</span>
        <span class='pycomment'># document, or we may change the type of quotes around the attribute value</span>
        <span class='pycomment'># (single to double quotes).</span>
        <span class='pycomment'># Note that improperly embedded non-HTML code (like client-side Javascript)</span>
        <span class='pycomment'># may be parsed incorrectly by the ancestor, causing runtime script errors.</span>
        <span class='pycomment'># All non-HTML code must be enclosed in HTML comment tags (&lt;!-- code --&gt;)</span>
        <span class='pycomment'># to ensure that it will pass through this parser unaltered (in handle_comment).</span>
        strattrs = <span class='pystring'>""</span>.join([<span class='pystring'>' %s="%s"'</span> % (key, value) <span class='pykeyword'>for</span> key, value <span class='pykeyword'>in</span> attrs])
        self.pieces.append(<span class='pystring'>"&lt;%(tag)s%(strattrs)s&gt;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> unknown_endtag</span>(self, tag):         
        <span class='pycomment'># called for each end tag, e.g. for &lt;/pre&gt;, tag will be "pre"</span>
        <span class='pycomment'># Reconstruct the original end tag.</span>
        self.pieces.append(<span class='pystring'>"&lt;/%(tag)s&gt;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> handle_charref</span>(self, ref):         
        <span class='pycomment'># called for each character reference, e.g. for "&amp;#160;", ref will be "160"</span>
        <span class='pycomment'># Reconstruct the original character reference.</span>
        self.pieces.append(<span class='pystring'>"&amp;#%(ref)s;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> handle_entityref</span>(self, ref):       
        <span class='pycomment'># called for each entity reference, e.g. for "&amp;copy;", ref will be "copy"</span>
        <span class='pycomment'># Reconstruct the original entity reference.</span>
        self.pieces.append(<span class='pystring'>"&amp;%(ref)s"</span> % locals())
        <span class='pycomment'># standard HTML entities are closed with a semicolon; other entities are not</span>
        <span class='pykeyword'>if</span> htmlentitydefs.entitydefs.has_key(ref):
            self.pieces.append(<span class='pystring'>";"</span>)

    <span class='pykeyword'>def</span><span class='pyclass'> handle_data</span>(self, text):           
        <span class='pycomment'># called for each block of plain text, i.e. outside of any tag and</span>
        <span class='pycomment'># not containing any character or entity references</span>
        <span class='pycomment'># Store the original text verbatim.</span>
        self.pieces.append(text)

    <span class='pykeyword'>def</span><span class='pyclass'> handle_comment</span>(self, text):        
        <span class='pycomment'># called for each HTML comment, e.g. &lt;!-- insert Javascript code here --&gt;</span>
        <span class='pycomment'># Reconstruct the original comment.</span>
        <span class='pycomment'># It is especially important that the source document enclose client-side</span>
        <span class='pycomment'># code (like Javascript) within comments so it can pass through this</span>
        <span class='pycomment'># processor undisturbed; see comments in unknown_starttag for details.</span>
        self.pieces.append(<span class='pystring'>"&lt;!--%(text)s--&gt;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> handle_pi</span>(self, text):             
        <span class='pycomment'># called for each processing instruction, e.g. &lt;?instruction&gt;</span>
        <span class='pycomment'># Reconstruct original processing instruction.</span>
        self.pieces.append(<span class='pystring'>"&lt;?%(text)s&gt;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> handle_decl</span>(self, text):
        <span class='pycomment'># called for the DOCTYPE, if present, e.g.</span>
        <span class='pycomment'># &lt;!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"</span>
        <span class='pycomment'>#     "http://www.w3.org/TR/html4/loose.dtd"&gt;</span>
        <span class='pycomment'># Reconstruct original DOCTYPE</span>
        self.pieces.append(<span class='pystring'>"&lt;!%(text)s&gt;"</span> % locals())

    <span class='pykeyword'>def</span><span class='pyclass'> output</span>(self):              
        <span class='pystring'>"""Return processed HTML as a single string"""</span>
        <span class='pykeyword'>return</span> <span class='pystring'>""</span>.join(self.pieces)</pre></div>
            <div class="example"><a name="d0e19875"></a><h3 class="title">&nbsp;8.2.&nbsp;<tt class="filename">dialect.py</tt></h3><pre class="programlisting"><span class='pykeyword'>
import</span> re
<span class='pykeyword'>from</span> BaseHTMLProcessor <span class='pykeyword'>import</span> BaseHTMLProcessor

<span class='pykeyword'>class</span><span class='pyclass'> Dialectizer</span>(BaseHTMLProcessor):
    subs = ()

    <span class='pykeyword'>def</span><span class='pyclass'> reset</span>(self):
        <span class='pycomment'># extend (called from __init__ in ancestor)</span>
        <span class='pycomment'># Reset all data attributes</span>
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    <span class='pykeyword'>def</span><span class='pyclass'> start_pre</span>(self, attrs):            
        <span class='pycomment'># called for every &lt;pre&gt; tag in HTML source</span>
        <span class='pycomment'># Increment verbatim mode count, then handle tag like normal</span>
        self.verbatim += 1                 
        self.unknown_starttag(<span class='pystring'>"pre"</span>, attrs)

    <span class='pykeyword'>def</span><span class='pyclass'> end_pre</span>(self):                     
        <span class='pycomment'># called for every &lt;/pre&gt; tag in HTML source</span>
        <span class='pycomment'># Decrement verbatim mode count</span>
        self.unknown_endtag(<span class='pystring'>"pre"</span>)         
        self.verbatim -= 1                 

    <span class='pykeyword'>def</span><span class='pyclass'> handle_data</span>(self, text):                                        
        <span class='pycomment'># override</span>
        <span class='pycomment'># called for every block of text in HTML source</span>
        <span class='pycomment'># If in verbatim mode, save text unaltered;</span>
        <span class='pycomment'># otherwise process the text with a series of substitutions</span>
        self.pieces.append(self.verbatim <span class='pykeyword'>and</span> text <span class='pykeyword'>or</span> self.process(text))

    <span class='pykeyword'>def</span><span class='pyclass'> process</span>(self, text):
        <span class='pycomment'># called from handle_data</span>
        <span class='pycomment'># Process text block by performing series of regular expression</span>
        <span class='pycomment'># substitutions (actual substitutions are defined in descendant)</span>
        <span class='pykeyword'>for</span> fromPattern, toPattern <span class='pykeyword'>in</span> self.subs:
            text = re.sub(fromPattern, toPattern, text)
        <span class='pykeyword'>return</span> text

<span class='pykeyword'>class</span><span class='pyclass'> ChefDialectizer</span>(Dialectizer):
    <span class='pystring'>"""convert HTML to Swedish Chef-speak
    
    based on the classic chef.x, copyright (c) 1992, 1993 John Hagerman
    """</span>
    subs = ((r<span class='pystring'>'a([nu])'</span>, r<span class='pystring'>'u\1'</span>),
            (r<span class='pystring'>'A([nu])'</span>, r<span class='pystring'>'U\1'</span>),
            (r<span class='pystring'>'a\B'</span>, r<span class='pystring'>'e'</span>),
            (r<span class='pystring'>'A\B'</span>, r<span class='pystring'>'E'</span>),
            (r<span class='pystring'>'en\b'</span>, r<span class='pystring'>'ee'</span>),
            (r<span class='pystring'>'\Bew'</span>, r<span class='pystring'>'oo'</span>),
            (r<span class='pystring'>'\Be\b'</span>, r<span class='pystring'>'e-a'</span>),
            (r<span class='pystring'>'\be'</span>, r<span class='pystring'>'i'</span>),
            (r<span class='pystring'>'\bE'</span>, r<span class='pystring'>'I'</span>),
            (r<span class='pystring'>'\Bf'</span>, r<span class='pystring'>'ff'</span>),
            (r<span class='pystring'>'\Bir'</span>, r<span class='pystring'>'ur'</span>),
            (r<span class='pystring'>'(\w*?)i(\w*?)$'</span>, r<span class='pystring'>'\1ee\2'</span>),
            (r<span class='pystring'>'\bow'</span>, r<span class='pystring'>'oo'</span>),
            (r<span class='pystring'>'\bo'</span>, r<span class='pystring'>'oo'</span>),
            (r<span class='pystring'>'\bO'</span>, r<span class='pystring'>'Oo'</span>),
            (r<span class='pystring'>'the'</span>, r<span class='pystring'>'zee'</span>),
            (r<span class='pystring'>'The'</span>, r<span class='pystring'>'Zee'</span>),
            (r<span class='pystring'>'th\b'</span>, r<span class='pystring'>'t'</span>),
            (r<span class='pystring'>'\Btion'</span>, r<span class='pystring'>'shun'</span>),
            (r<span class='pystring'>'\Bu'</span>, r<span class='pystring'>'oo'</span>),
            (r<span class='pystring'>'\BU'</span>, r<span class='pystring'>'Oo'</span>),
            (r<span class='pystring'>'v'</span>, r<span class='pystring'>'f'</span>),
            (r<span class='pystring'>'V'</span>, r<span class='pystring'>'F'</span>),
            (r<span class='pystring'>'w'</span>, r<span class='pystring'>'w'</span>),
            (r<span class='pystring'>'W'</span>, r<span class='pystring'>'W'</span>),
            (r<span class='pystring'>'([a-z])[.]'</span>, r<span class='pystring'>'\1.  Bork Bork Bork!'</span>))

<span class='pykeyword'>class</span><span class='pyclass'> FuddDialectizer</span>(Dialectizer):
    <span class='pystring'>"""convert HTML to Elmer Fudd-speak"""</span>
    subs = ((r<span class='pystring'>'[rl]'</span>, r<span class='pystring'>'w'</span>),
            (r<span class='pystring'>'qu'</span>, r<span class='pystring'>'qw'</span>),
            (r<span class='pystring'>'th\b'</span>, r<span class='pystring'>'f'</span>),
            (r<span class='pystring'>'th'</span>, r<span class='pystring'>'d'</span>),
            (r<span class='pystring'>'n[.]'</span>, r<span class='pystring'>'n, uh-hah-hah-hah.'</span>))

<span class='pykeyword'>class</span><span class='pyclass'> OldeDialectizer</span>(Dialectizer):
    <span class='pystring'>"""convert HTML to mock Middle English"""</span>
    subs = ((r<span class='pystring'>'i([bcdfghjklmnpqrstvwxyz])e\b'</span>, r<span class='pystring'>'y\1'</span>),
            (r<span class='pystring'>'i([bcdfghjklmnpqrstvwxyz])e'</span>, r<span class='pystring'>'y\1\1e'</span>),
            (r<span class='pystring'>'ick\b'</span>, r<span class='pystring'>'yk'</span>),
            (r<span class='pystring'>'ia([bcdfghjklmnpqrstvwxyz])'</span>, r<span class='pystring'>'e\1e'</span>),
            (r<span class='pystring'>'e[ea]([bcdfghjklmnpqrstvwxyz])'</span>, r<span class='pystring'>'e\1e'</span>),
            (r<span class='pystring'>'([bcdfghjklmnpqrstvwxyz])y'</span>, r<span class='pystring'>'\1ee'</span>),
            (r<span class='pystring'>'([bcdfghjklmnpqrstvwxyz])er'</span>, r<span class='pystring'>'\1re'</span>),
            (r<span class='pystring'>'([aeiou])re\b'</span>, r<span class='pystring'>'\1r'</span>),
            (r<span class='pystring'>'ia([bcdfghjklmnpqrstvwxyz])'</span>, r<span class='pystring'>'i\1e'</span>),
            (r<span class='pystring'>'tion\b'</span>, r<span class='pystring'>'cioun'</span>),
            (r<span class='pystring'>'ion\b'</span>, r<span class='pystring'>'ioun'</span>),
            (r<span class='pystring'>'aid'</span>, r<span class='pystring'>'ayde'</span>),
            (r<span class='pystring'>'ai'</span>, r<span class='pystring'>'ey'</span>),
            (r<span class='pystring'>'ay\b'</span>, r<span class='pystring'>'y'</span>),
            (r<span class='pystring'>'ay'</span>, r<span class='pystring'>'ey'</span>),
            (r<span class='pystring'>'ant'</span>, r<span class='pystring'>'aunt'</span>),
            (r<span class='pystring'>'ea'</span>, r<span class='pystring'>'ee'</span>),
            (r<span class='pystring'>'oa'</span>, r<span class='pystring'>'oo'</span>),
            (r<span class='pystring'>'ue'</span>, r<span class='pystring'>'e'</span>),
            (r<span class='pystring'>'oe'</span>, r<span class='pystring'>'o'</span>),
            (r<span class='pystring'>'ou'</span>, r<span class='pystring'>'ow'</span>),
            (r<span class='pystring'>'ow'</span>, r<span class='pystring'>'ou'</span>),
            (r<span class='pystring'>'\bhe'</span>, r<span class='pystring'>'hi'</span>),
            (r<span class='pystring'>'ve\b'</span>, r<span class='pystring'>'veth'</span>),
            (r<span class='pystring'>'se\b'</span>, r<span class='pystring'>'e'</span>),
            (r<span class='pystring'>"'s\b"</span>, r<span class='pystring'>'es'</span>),
            (r<span class='pystring'>'ic\b'</span>, r<span class='pystring'>'ick'</span>),
            (r<span class='pystring'>'ics\b'</span>, r<span class='pystring'>'icc'</span>),
            (r<span class='pystring'>'ical\b'</span>, r<span class='pystring'>'ick'</span>),
            (r<span class='pystring'>'tle\b'</span>, r<span class='pystring'>'til'</span>),
            (r<span class='pystring'>'ll\b'</span>, r<span class='pystring'>'l'</span>),
            (r<span class='pystring'>'ould\b'</span>, r<span class='pystring'>'olde'</span>),
            (r<span class='pystring'>'own\b'</span>, r<span class='pystring'>'oune'</span>),
            (r<span class='pystring'>'un\b'</span>, r<span class='pystring'>'onne'</span>),
            (r<span class='pystring'>'rry\b'</span>, r<span class='pystring'>'rye'</span>),
            (r<span class='pystring'>'est\b'</span>, r<span class='pystring'>'este'</span>),
            (r<span class='pystring'>'pt\b'</span>, r<span class='pystring'>'pte'</span>),
            (r<span class='pystring'>'th\b'</span>, r<span class='pystring'>'the'</span>),
            (r<span class='pystring'>'ch\b'</span>, r<span class='pystring'>'che'</span>),
            (r<span class='pystring'>'ss\b'</span>, r<span class='pystring'>'sse'</span>),
            (r<span class='pystring'>'([wybdp])\b'</span>, r<span class='pystring'>'\1e'</span>),
            (r<span class='pystring'>'([rnt])\b'</span>, r<span class='pystring'>'\1\1e'</span>),
            (r<span class='pystring'>'from'</span>, r<span class='pystring'>'fro'</span>),
            (r<span class='pystring'>'when'</span>, r<span class='pystring'>'whan'</span>))

<span class='pykeyword'>def</span><span class='pyclass'> translate</span>(url, dialectName=<span class='pystring'>"chef"</span>):
    <span class='pystring'>"""fetch URL and translate using dialect
    
    dialect in ("chef", "fudd", "olde")"""</span>
    <span class='pykeyword'>import</span> urllib                      
    sock = urllib.urlopen(url)         
    htmlSource = sock.read()           
    sock.close()                       
    parserName = <span class='pystring'>"%sDialectizer"</span> % dialectName.capitalize()
    parserClass = globals()[parserName]                    
    parser = parserClass()                                 
    parser.feed(htmlSource)
    parser.close()         
    <span class='pykeyword'>return</span> parser.output() 

<span class='pykeyword'>def</span><span class='pyclass'> test</span>(url):
    <span class='pystring'>"""test all dialects against URL"""</span>
    <span class='pykeyword'>for</span> dialect <span class='pykeyword'>in</span> (<span class='pystring'>"chef"</span>, <span class='pystring'>"fudd"</span>, <span class='pystring'>"olde"</span>):
        outfile = <span class='pystring'>"%s.html"</span> % dialect
        fsock = open(outfile, <span class='pystring'>"wb"</span>)
        fsock.write(translate(url, dialect))
        fsock.close()
        <span class='pykeyword'>import</span> webbrowser
        webbrowser.open_new(outfile)

<span class='pykeyword'>if</span> __name__ == <span class='pystring'>"__main__"</span>:
    test(<span class='pystring'>"http://diveintopython.org/odbchelper_list.html"</span>)</pre></div>
            <div class="example"><a name="d0e19881"></a><h3 class="title">&nbsp;8.3.&nbsp;<tt class="filename">dialect.py</tt> 的输出结果
               </h3>
               <p>运行这个脚本会将 <a href="../native_data_types/lists.html" title="3.2.&nbsp;List 介绍">&nbsp;3.2&nbsp;节 “List 介绍”</a> 转换成<a href="../native_data_types/chef.html">模仿瑞典厨师用语 (mock Swedish Chef-speak)</a> (来自 The Muppets)、<a href="../native_data_types/fudd.html">模仿埃尔默唠叨者用语 (mock Elmer Fudd-speak)</a> (来自 Bugs Bunny 卡通画) 和<a href="../native_data_types/olde.html">模仿中世纪英语 (mock Middle English)</a> (零散地来源于乔叟的<i class="citetitle">《坎特伯雷故事集》</i>)。如果您查看输出页面的 <span class="acronym">HTML</span> 源代码,您会发现所有的 <span class="acronym">HTML</span> 标记和属性没有改动,但是在标记之间的文本被转换成模仿语言了。如果您观察得更仔细些,您会发现,实际上,仅有标题和段落被转换了;代码列表和屏幕例子没有改动。
               </p><pre class="programlisting">
&lt;div <span class='pykeyword'>class</span>="<span class='pyclass'>abstract</span>"&gt;
&lt;p&gt;Lists awe &lt;span <span class='pykeyword'>class</span>="<span class='pyclass'>application</span>"&gt;Pydon&lt;/span&gt;'s wowkhowse datatype.
If youw onwy expewience wif wists <span class='pykeyword'>is</span> awways <span class='pykeyword'>in</span>
&lt;span <span class='pykeyword'>class</span>="<span class='pyclass'>application</span>"&gt;Visuaw Basic&lt;/span&gt; ow (God fowbid) de datastowe
<span class='pykeyword'>in</span> &lt;span <span class='pykeyword'>class</span>="<span class='pyclass'>application</span>"&gt;Powewbuiwdew&lt;/span&gt;, bwace youwsewf fow
&lt;span <span class='pykeyword'>class</span>="<span class='pyclass'>application</span>"&gt;Pydon&lt;/span&gt; wists.&lt;/p&gt;
&lt;/div&gt;
</pre></div>
         </div>
      </div>
      <table class="Footer" width="100%" border="0" cellpadding="0" cellspacing="0" summary="">
         <tr>
            <td width="35%" align="left"><br><a class="NavigationArrow" href="../regular_expressions/summary.html">&lt;&lt;&nbsp;小结</a></td>
            <td width="30%" align="center"><br>&nbsp;<span class="divider">|</span>&nbsp;<span class="thispage">1</span> <span class="divider">|</span> <a href="introducing_sgmllib.html" title="8.2.&nbsp;sgmllib.py 介绍">2</a> <span class="divider">|</span> <a href="extracting_data.html" title="8.3.&nbsp;从 HTML 文档中提取数据">3</a> <span class="divider">|</span> <a href="basehtmlprocessor.html" title="8.4.&nbsp;BaseHTMLProcessor.py 介绍">4</a> <span class="divider">|</span> <a href="locals_and_globals.html" title="8.5.&nbsp;locals 和 globals">5</a> <span class="divider">|</span> <a href="dictionary_based_string_formatting.html" title="8.6.&nbsp;基于 dictionary 的字符串格式化">6</a> <span class="divider">|</span> <a href="quoting_attribute_values.html" title="8.7.&nbsp;给属性值加引号">7</a> <span class="divider">|</span> <a href="dialect.html" title="8.8.&nbsp;dialect.py 介绍">8</a> <span class="divider">|</span> <a href="all_together.html" title="8.9.&nbsp;全部放在一起">9</a> <span class="divider">|</span> <a href="summary.html" title="8.10.&nbsp;小结">10</a>&nbsp;<span class="divider">|</span>&nbsp;
            </td>
            <td width="35%" align="right"><br><a class="NavigationArrow" href="introducing_sgmllib.html">sgmllib.py 介绍&nbsp;&gt;&gt;</a></td>
         </tr>
         <tr>
            <td colspan="3"><br></td>
         </tr>
      </table>
      <div class="Footer">
         <p class="copyright">Copyright © 2000, 2001, 2002, 2003, 2004 <a href="mailto:mark@diveintopython.org">Mark Pilgrim</a></p>
         <p class="copyright">Copyright © 2001, 2002, 2003, 2004, 2005, 2006, 2007 <a href="mailto:python-cn@googlegroups.com">CPyUG (邮件列表)</a></p>
      </div>
   </body>
</html>