<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://mediawiki.zeropage.org/index.php?action=history&amp;feed=atom&amp;title=Kongulo</id>
	<title>Kongulo - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://mediawiki.zeropage.org/index.php?action=history&amp;feed=atom&amp;title=Kongulo"/>
	<link rel="alternate" type="text/html" href="https://mediawiki.zeropage.org/index.php?title=Kongulo&amp;action=history"/>
	<updated>2026-05-15T00:05:03Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.39.8</generator>
	<entry>
		<id>https://mediawiki.zeropage.org/index.php?title=Kongulo&amp;diff=33656&amp;oldid=prev</id>
		<title>imported&gt;Unknown at 05:23, 7 February 2021</title>
		<link rel="alternate" type="text/html" href="https://mediawiki.zeropage.org/index.php?title=Kongulo&amp;diff=33656&amp;oldid=prev"/>
		<updated>2021-02-07T05:23:36Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt; #!/usr/bin/env python&lt;br /&gt;
 &lt;br /&gt;
 # Copyright (c) 2005, Google Inc.&lt;br /&gt;
 # All rights reserved.&lt;br /&gt;
 # &lt;br /&gt;
 # Redistribution and use in source and binary forms, with or without&lt;br /&gt;
 # modification, are permitted provided that the following conditions are&lt;br /&gt;
 # met:&lt;br /&gt;
 # &lt;br /&gt;
 #     * Redistributions of source code must retain the above copyright&lt;br /&gt;
 # notice, this list of conditions and the following disclaimer.&lt;br /&gt;
 #     * Redistributions in binary form must reproduce the above&lt;br /&gt;
 # copyright notice, this list of conditions and the following disclaimer&lt;br /&gt;
 # in the documentation and/or other materials provided with the&lt;br /&gt;
 # distribution.&lt;br /&gt;
 #     * Neither the name of Google Inc. nor the names of its&lt;br /&gt;
 # contributors may be used to endorse or promote products derived from&lt;br /&gt;
 # this software without specific prior written permission.&lt;br /&gt;
 # &lt;br /&gt;
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS&lt;br /&gt;
 # &amp;quot;AS IS&amp;quot; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT&lt;br /&gt;
 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR&lt;br /&gt;
 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT&lt;br /&gt;
 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,&lt;br /&gt;
 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT&lt;br /&gt;
 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,&lt;br /&gt;
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY&lt;br /&gt;
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT&lt;br /&gt;
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE&lt;br /&gt;
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.&lt;br /&gt;
 &lt;br /&gt;
 import robotparser&lt;br /&gt;
 import urllib&lt;br /&gt;
 import urllib2&lt;br /&gt;
 import re&lt;br /&gt;
 import sets&lt;br /&gt;
 import sys&lt;br /&gt;
 import urlparse&lt;br /&gt;
 import win32com.client&lt;br /&gt;
 import time&lt;br /&gt;
 import pywintypes&lt;br /&gt;
 import pythoncom&lt;br /&gt;
 import optparse&lt;br /&gt;
 import getpass&lt;br /&gt;
 import itertools&lt;br /&gt;
 import email.Utils&lt;br /&gt;
 &lt;br /&gt;
 &amp;#039;&amp;#039;&amp;#039;A simple web crawler that pushes pages into GDS.  Features include:&lt;br /&gt;
   - Knows basic and digest HTTP authentication&lt;br /&gt;
   - Obeys robots.txt&lt;br /&gt;
   - Can loop, recrawling over previously crawled pages every X minutes&lt;br /&gt;
   - When recrawling, uses If-Modified-Since HTTP header to minimize transfers&lt;br /&gt;
 &lt;br /&gt;
 For usage instructions, run with -h flag.&lt;br /&gt;
 &lt;br /&gt;
 Requires Python 2.4 and the win32all extensions for Python 2.4 on Windows.&lt;br /&gt;
 Will not work unless Google Desktop Search 1.0 or later is installed.&lt;br /&gt;
 &amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
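 &lt;br /&gt;
 # A purely illustrative invocation (the host name and flag values below are&lt;br /&gt;
 # made up, not taken from the original docs); the flags are the ones defined&lt;br /&gt;
 # in Main() further down:&lt;br /&gt;
 #&lt;br /&gt;
 #   python kongulo.py -d 2 -l -s 30 -m &amp;quot;.+intranet\.example\.com.+&amp;quot; \&lt;br /&gt;
 #       http://intranet.example.com/&lt;br /&gt;
 #&lt;br /&gt;
 # This follows links two levels deep from the base URL, only fetches URLs&lt;br /&gt;
 # matching the -m pattern, and recrawls every 30 minutes using&lt;br /&gt;
 # If-Modified-Since.&lt;br /&gt;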
 &lt;br /&gt;
 # Matches URLs in &amp;lt;a href=...&amp;gt; tags.  Chosen over htmllib.HTMLParser because&lt;br /&gt;
 # this is much more lenient, not requiring HTML to be valid.&lt;br /&gt;
 _LINK_RE = re.compile(r&amp;#039;&amp;lt;\s*(a|img).+href\s*=\s*&amp;quot;?(.+?)&amp;quot;?(\s|&amp;gt;)&amp;#039;,&lt;br /&gt;
                       re.MULTILINE | re.IGNORECASE)&lt;br /&gt;
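 # For example (illustrative markup, not from the original comments): in&lt;br /&gt;
 # &amp;lt;a href=&amp;quot;http://host/page.html&amp;quot;&amp;gt; the URL ends up in group(2), which is&lt;br /&gt;
 # what Crawler.ExtractLinks() below pulls out.&lt;br /&gt;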
 &lt;br /&gt;
 &lt;br /&gt;
 # Matches &amp;lt;frame src=&amp;quot;bla&amp;quot;&amp;gt; tags.&lt;br /&gt;
 _FRAME_RE = re.compile(r&amp;#039;&amp;lt;\s*(frame).+src\s*=\s*&amp;quot;?(.+?)&amp;quot;?(\s|&amp;gt;)&amp;#039;,&lt;br /&gt;
                        re.MULTILINE | re.IGNORECASE)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 # Digs out the text of an HTML document&amp;#039;s title.&lt;br /&gt;
 _TITLE_RE = re.compile(r&amp;#039;&amp;lt;\s*title.*?&amp;gt;(.+)&amp;lt;/\s*title\s*&amp;gt;&amp;#039;,&lt;br /&gt;
                        re.MULTILINE | re.IGNORECASE)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 # This plugin&amp;#039;s GUID, used to register with GDS.&lt;br /&gt;
 _GUID = &amp;#039;{5e1788fe-a6e6-429f-816c-80cb969028d3}&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 class NoExceptionHandler(urllib2.BaseHandler):&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;An exception handler for HTTP that never throws an exception for various&lt;br /&gt;
   error codes that Kongulo always checks explicitly rather than catching them&lt;br /&gt;
   as exceptions.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
   def http_error_304(self, req, fp, code, msg, hdrs):&lt;br /&gt;
     &amp;#039;&amp;#039;&amp;#039;We handle not-modified-since explicitly.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
     return fp&lt;br /&gt;
   &lt;br /&gt;
   # We check error codes explicitly so we don&amp;#039;t want an exception&lt;br /&gt;
   http_error_400 = http_error_401 = http_error_402 = http_error_403 \&lt;br /&gt;
   = http_error_404 = http_error_304&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 class PasswordDb(urllib2.HTTPPasswordMgr):&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;A very simple password store.  The user can supply usernames using the&lt;br /&gt;
   -p flag on the command line, and will be prompted for the password for&lt;br /&gt;
   each username.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
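   # Example with made-up values: running with -p joi@corp.example.com stores&lt;br /&gt;
   # [&amp;#039;corp.example.com&amp;#039;, &amp;#039;joi&amp;#039;, &amp;lt;password typed at the prompt&amp;gt;], and&lt;br /&gt;
   # find_user_password() then returns that uid/password pair for any realm or&lt;br /&gt;
   # host string containing the substring &amp;#039;corp.example.com&amp;#039;.&lt;br /&gt;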
   &lt;br /&gt;
   def __init__(self):&lt;br /&gt;
     self.passwords = []  # [ [substring, uid, pw], [substring, uid, pw] ]&lt;br /&gt;
   &lt;br /&gt;
   def Populate(self, options):&lt;br /&gt;
     &amp;#039;&amp;#039;&amp;#039;Given an options object as used by Kongulo, ask the user for the&lt;br /&gt;
     password for each user-id/substring-of-domain that the user provided using&lt;br /&gt;
     the -p flag.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
     if not options.pw:&lt;br /&gt;
       return&lt;br /&gt;
     &lt;br /&gt;
     for item in options.pw.split(&amp;#039;,&amp;#039;):&lt;br /&gt;
       (uid, substring) = item.split(&amp;#039;@&amp;#039;)&lt;br /&gt;
       pw = getpass.getpass(&amp;#039;Enter password for %s: &amp;#039; % item)&lt;br /&gt;
       self.passwords.append([substring, uid, pw])&lt;br /&gt;
     &lt;br /&gt;
   def find_user_password(self, *args, **kw):&lt;br /&gt;
     for passdata in self.passwords:&lt;br /&gt;
       for name in args:&lt;br /&gt;
         if name.find(passdata[0]) != -1:&lt;br /&gt;
           return (passdata[1], passdata[2])&lt;br /&gt;
     print &amp;quot;!!! Need login info for (%s @ %s), consider using -p flag&amp;quot; % args&lt;br /&gt;
     return (None, None)&lt;br /&gt;
 &lt;br /&gt;
 passwords = PasswordDb()&lt;br /&gt;
 &lt;br /&gt;
 # A URL opener that can do basic and digest authentication, and never raises&lt;br /&gt;
 # exceptions for HTTP error codes we handle explicitly.&lt;br /&gt;
 opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(passwords),&lt;br /&gt;
                               urllib2.HTTPDigestAuthHandler(passwords),&lt;br /&gt;
                               NoExceptionHandler())&lt;br /&gt;
 &lt;br /&gt;
 # To be a nice Internet citizen, we identify ourselves properly so that&lt;br /&gt;
 # whoever doesn&amp;#039;t like Kongulo can exclude us using robots.txt&lt;br /&gt;
 opener.addheaders = [(&amp;#039;User-agent&amp;#039;, &amp;#039;Kongulo v0.1 personal web crawler&amp;#039;)]&lt;br /&gt;
 &lt;br /&gt;
 # Should always be true on Windows systems.&lt;br /&gt;
 assert hasattr(opener.handlers[0],&lt;br /&gt;
                &amp;#039;proxies&amp;#039;), &amp;#039;ProxyHandler must be first handler.&amp;#039;&lt;br /&gt;
 # This parses Windows proxy registry settings&lt;br /&gt;
 opener.handlers[0].proxies = urllib.getproxies()&lt;br /&gt;
 &lt;br /&gt;
 class LenientRobotParser(robotparser.RobotFileParser):&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;Adds ability to parse robot files where same user agent is specified&lt;br /&gt;
   multiple times.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
   def __init__(self, url):&lt;br /&gt;
     &amp;#039;&amp;#039;&amp;#039;Setup internal state like RobotFileParser does.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
     robotparser.RobotFileParser.__init__(self)&lt;br /&gt;
     f = opener.open(url)&lt;br /&gt;
     lines = []&lt;br /&gt;
     line = f.readline()&lt;br /&gt;
     while line:&lt;br /&gt;
         lines.append(line.strip())&lt;br /&gt;
         line = f.readline()&lt;br /&gt;
     self.errcode = f.code&lt;br /&gt;
     if self.errcode == 401 or self.errcode == 403:&lt;br /&gt;
         self.disallow_all = 1&lt;br /&gt;
     elif self.errcode &amp;gt;= 400:&lt;br /&gt;
         self.allow_all = 1&lt;br /&gt;
     elif self.errcode == 200 and lines:&lt;br /&gt;
         self.parse(lines)&lt;br /&gt;
     &lt;br /&gt;
   def parse(self, lines):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Strip repeated sequential definitions of same user agent, then&lt;br /&gt;
     call base&amp;#039;s parse method.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     last_ua = &amp;#039;&amp;#039;&lt;br /&gt;
     modified_lines = []&lt;br /&gt;
     for line in lines:&lt;br /&gt;
       line = line.strip()&lt;br /&gt;
       if line.lower().startswith(&amp;#039;user-agent&amp;#039;):&lt;br /&gt;
         temp = last_ua&lt;br /&gt;
         last_ua = line.lower()&lt;br /&gt;
         if last_ua == temp:&lt;br /&gt;
           continue  # skip line&lt;br /&gt;
       if line.strip() == &amp;#039;&amp;#039;:&lt;br /&gt;
         last_ua = &amp;#039;&amp;#039;  # reset on blank line&lt;br /&gt;
       modified_lines += [line]&lt;br /&gt;
     &lt;br /&gt;
     robotparser.RobotFileParser.parse(self, modified_lines)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 class UrlValidator:&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;An object that handles checking if we should fetch and crawl a specific&lt;br /&gt;
   URL.  This is based on the type of the URL (only crawl http URLs) and robot&lt;br /&gt;
   rules.  Maintains a cache of robot rules already fetched.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
   &lt;br /&gt;
   def __init__(self, match_url):&lt;br /&gt;
     self.robots = {}  # Dict of robot URLs to robot parsers&lt;br /&gt;
     self.match_url = re.compile(match_url)&lt;br /&gt;
   &lt;br /&gt;
   def IsCrawlable(self, url):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Returns true if it&amp;#039;s OK to crawl the absolute URL provided.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     if not url.startswith(&amp;#039;http&amp;#039;) or not self.match_url.match(url):&lt;br /&gt;
       return 0&lt;br /&gt;
     return self.GetRules(url).can_fetch(&amp;#039;*&amp;#039;, url)&lt;br /&gt;
   &lt;br /&gt;
   def GetRules(self, url):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Returns the robot rules parser for &amp;#039;url&amp;#039;&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     robots_dir = urlparse.urljoin(url, &amp;quot;robots.txt&amp;quot;)  # First try dir-level&lt;br /&gt;
     if robots_dir in self.robots:&lt;br /&gt;
       return self.robots[robots_dir]&lt;br /&gt;
     robots_site = urlparse.urljoin(url, &amp;quot;/robots.txt&amp;quot;)  # Then the site-level&lt;br /&gt;
     if robots_site in self.robots:&lt;br /&gt;
       return self.robots[robots_site]&lt;br /&gt;
     &lt;br /&gt;
     # Inv: Our cache contains neither a dir-level nor site-level robots.txt file&lt;br /&gt;
     &lt;br /&gt;
     rules = LenientRobotParser(robots_dir)  # First try dir-level&lt;br /&gt;
     if hasattr(rules, &amp;#039;errcode&amp;#039;) and rules.errcode == 200:&lt;br /&gt;
       self.robots[robots_dir] = rules&lt;br /&gt;
     else:&lt;br /&gt;
       rules = LenientRobotParser(robots_site)  # Then try site-level&lt;br /&gt;
       self.robots[robots_site] = rules&lt;br /&gt;
     &lt;br /&gt;
     return rules&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 class Crawler:&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;This object holds the state of the crawl, and performs the crawl.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
   &lt;br /&gt;
   def __init__(self, options):    &lt;br /&gt;
     self.options = options  # Store the options provided&lt;br /&gt;
     self.rules = UrlValidator(options.match)  # Cache of robot rules etc.&lt;br /&gt;
     &lt;br /&gt;
     # Invariant of data:&lt;br /&gt;
     # - &amp;#039;tocrawl&amp;#039; is a list of items that we have or will crawl.  If we have&lt;br /&gt;
     #   never crawled them since we started, the item at index 2 in each&lt;br /&gt;
     #   crawlitem is None, otherwise it is a dictionary of headers,&lt;br /&gt;
     #   specifically the &amp;#039;If-Modified-Since&amp;#039; header, to prevent us from fetching&lt;br /&gt;
     #   this item in the next crawl if it hasn&amp;#039;t been modified.&lt;br /&gt;
     # - &amp;#039;scheduled&amp;#039; is a list of items we have already added to &amp;#039;tocrawl&amp;#039;&lt;br /&gt;
     #   (perhaps a premature optimization since we could just iterate over&lt;br /&gt;
     #   &amp;#039;tocrawl&amp;#039;)&lt;br /&gt;
     self.scheduled = sets.Set()&lt;br /&gt;
     # Format of this list is:&lt;br /&gt;
     # [[url1, depth1, { headername : headerval, ... }], [url2, depth2, {}], ...]&lt;br /&gt;
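     # As a concrete (made-up) illustration: before the first fetch an entry&lt;br /&gt;
     # looks like [&amp;#039;http://example.com/&amp;#039;, 2, None]; after a successful fetch&lt;br /&gt;
     # its third slot becomes {&amp;#039;If-Modified-Since&amp;#039;: &amp;#039;Sat, 05 Feb 2005 10:00:00 GMT&amp;#039;}.&lt;br /&gt;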
     self.tocrawl = []&lt;br /&gt;
     &lt;br /&gt;
     # Fetch the entrypoint to the Google Desktop Search API.&lt;br /&gt;
 &lt;br /&gt;
   &lt;br /&gt;
   def ExtractLinks(self, baseurl, htmldoc):&lt;br /&gt;
     &amp;quot;&amp;quot;&amp;quot;Returns all anchors from the document with contents &amp;#039;htmldoc&amp;#039; at&lt;br /&gt;
     &amp;#039;baseurl&amp;#039; that are OK to crawl.&amp;quot;&amp;quot;&amp;quot;&lt;br /&gt;
     urls = []&lt;br /&gt;
     for match in itertools.chain(_LINK_RE.finditer(htmldoc),&lt;br /&gt;
                                  _FRAME_RE.finditer(htmldoc)):&lt;br /&gt;
       url = urlparse.urljoin(baseurl, match.group(2))&lt;br /&gt;
       if self.rules.IsCrawlable(url):&lt;br /&gt;
         urls += [url]&lt;br /&gt;
       else:&lt;br /&gt;
         print &amp;quot;    I %s&amp;quot; % url&lt;br /&gt;
     return urls&lt;br /&gt;
   &lt;br /&gt;
   def Crawl(self, baseurls):&lt;br /&gt;
     &amp;#039;&amp;#039;&amp;#039;Performs the crawl.&lt;br /&gt;
     &lt;br /&gt;
     Args:&lt;br /&gt;
       baseurls: [url1, url2, ...]&lt;br /&gt;
     &amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
     # Bootstrap our invariant of data&lt;br /&gt;
     for baseurl in baseurls:&lt;br /&gt;
       self.tocrawl.append([baseurl, self.options.depth, None])&lt;br /&gt;
     &lt;br /&gt;
     if self.options.loop:&lt;br /&gt;
       print &amp;quot;Running in loop mode - press Ctrl-C to stop.&amp;quot;&lt;br /&gt;
     &lt;br /&gt;
     while True:&lt;br /&gt;
       for crawlitem in self.tocrawl:&lt;br /&gt;
         (url, depth, headers) = crawlitem&lt;br /&gt;
         try:&lt;br /&gt;
           if headers:&lt;br /&gt;
             doc = opener.open(urllib2.Request(url, headers=headers))&lt;br /&gt;
           else:&lt;br /&gt;
             doc = opener.open(url)&lt;br /&gt;
           &lt;br /&gt;
           doctype = doc.info().type&lt;br /&gt;
           if doc.code == 304:  # not modified since last time&lt;br /&gt;
             print &amp;quot;--- (nomod) %s&amp;quot; % url&lt;br /&gt;
           elif (doc.code == 200 and doctype == &amp;#039;text/html&amp;#039; or&lt;br /&gt;
                 doctype == &amp;#039;text/plain&amp;#039;):&lt;br /&gt;
             print &amp;quot;::: (%d) %s&amp;quot; % (depth, url)&lt;br /&gt;
             &lt;br /&gt;
             # Store last modified in the crawlitem&lt;br /&gt;
             # Prefer Last-Modified header, then Date header (to get same&lt;br /&gt;
             # formatting as used by the server), then current date in&lt;br /&gt;
             # appropriate format.&lt;br /&gt;
             last_modified = None&lt;br /&gt;
             if &amp;#039;last-modified&amp;#039; in doc.headers:&lt;br /&gt;
               last_modified = doc.headers[&amp;#039;last-modified&amp;#039;]&lt;br /&gt;
             elif &amp;#039;date&amp;#039; in doc.headers:&lt;br /&gt;
               last_modified = doc.headers[&amp;#039;date&amp;#039;]&lt;br /&gt;
             else:&lt;br /&gt;
               last_modified = email.Utils.formatdate(time.time(), usegmt=True)&lt;br /&gt;
             crawlitem[2] = { &amp;#039;If-Modified-Since&amp;#039; : last_modified }&lt;br /&gt;
             &lt;br /&gt;
             content = doc.read()&lt;br /&gt;
             &lt;br /&gt;
             # Create a GDS event, populate its fields, and send it off to have&lt;br /&gt;
             # the web page added to the Google Desktop Search index.&lt;br /&gt;
             &lt;br /&gt;
             #event.AddProperty(&amp;#039;format&amp;#039;, doctype)&lt;br /&gt;
             #event.AddProperty(&amp;#039;content&amp;#039;, content)&lt;br /&gt;
             #event.AddProperty(&amp;#039;uri&amp;#039;, url)&lt;br /&gt;
             fout = file(&amp;quot;output.txt&amp;quot;, &amp;#039;w&amp;#039;)&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, &amp;quot;\n****doctype********\n&amp;quot;&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, doctype&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, &amp;quot;\n****content********\n&amp;quot;&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, content&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, &amp;quot;\n******url******\n&amp;quot;&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, url&lt;br /&gt;
 &lt;br /&gt;
             # TODO Use the last-modified HTTP header instead of current time&lt;br /&gt;
             # if available.&lt;br /&gt;
             #event.AddProperty(&amp;#039;last_modified_time&amp;#039;,&lt;br /&gt;
             #                  pywintypes.Time(time.time() + time.timezone))&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, &amp;quot;\n*****time*********\n&amp;quot;&lt;br /&gt;
             print &amp;gt;&amp;gt; fout, pywintypes.Time(time.time() + time.timezone)&lt;br /&gt;
 &lt;br /&gt;
             &lt;br /&gt;
             if doctype == &amp;#039;text/html&amp;#039;:  # no links in text documents&lt;br /&gt;
               title_match = _TITLE_RE.search(content)&lt;br /&gt;
               if title_match:&lt;br /&gt;
                 title = title_match.group(1)&lt;br /&gt;
                 #event.AddProperty(&amp;#039;title&amp;#039;, title)&lt;br /&gt;
                 print &amp;gt;&amp;gt; fout, &amp;quot;\n*****title*******\n&amp;quot;&lt;br /&gt;
                 print &amp;gt;&amp;gt; fout, title&lt;br /&gt;
               &lt;br /&gt;
               for link in self.ExtractLinks(doc.geturl(), content):&lt;br /&gt;
                 if depth &amp;gt; 0 and not link in self.scheduled:&lt;br /&gt;
                   self.scheduled.add(link)&lt;br /&gt;
                   self.tocrawl.append([link, depth - 1, None])&lt;br /&gt;
   &lt;br /&gt;
             # Don&amp;#039;t use historical flag, because if we do, GDS will &amp;quot;throttle&amp;quot;&lt;br /&gt;
             # the events we send, not returning until the user becomes idle.&lt;br /&gt;
             # We also want to ensure the page is updated in the cache (in case&lt;br /&gt;
             # the user already visited it herself using a browser).&lt;br /&gt;
             #event.Send(0x01)&lt;br /&gt;
           else:&lt;br /&gt;
             print &amp;quot;!!! (HTTP %d) %s&amp;quot; % (doc.code, url)&lt;br /&gt;
     &lt;br /&gt;
           doc.close()&lt;br /&gt;
         except IOError:&lt;br /&gt;
           print &amp;quot;!!! (nolink) %s&amp;quot; % url&lt;br /&gt;
         except ValueError:&lt;br /&gt;
           print &amp;quot;!!! (noauth) %s&amp;quot; % url&lt;br /&gt;
       &lt;br /&gt;
       if not self.options.loop:&lt;br /&gt;
         break&lt;br /&gt;
       else:&lt;br /&gt;
         print (&amp;quot;=== Completed crawl; will recrawl in %d minutes.&amp;quot; %&lt;br /&gt;
                (self.options.sleep))&lt;br /&gt;
         time.sleep(60 * self.options.sleep)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 def Main():&lt;br /&gt;
   &amp;#039;&amp;#039;&amp;#039;This function contains the logic for the command-line UI for Kongulo.&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
   &lt;br /&gt;
   # Set up options and parse arguments.&lt;br /&gt;
   parser = optparse.OptionParser(usage=&amp;#039;%prog [options] BASEURL1 BASEURL2 ...&amp;#039;)&lt;br /&gt;
   parser.add_option(&amp;#039;-d&amp;#039;, &amp;#039;--depth&amp;#039;, type=&amp;#039;int&amp;#039;, dest=&amp;#039;depth&amp;#039;, default=0,&lt;br /&gt;
                     help=&amp;#039;How deep to follow links from BASEURLs (default 0, &amp;#039;&lt;br /&gt;
                          &amp;#039;suggest max 5-6)&amp;#039;)&lt;br /&gt;
   parser.add_option(&amp;#039;-m&amp;#039;, &amp;#039;--match&amp;#039;, dest=&amp;#039;match&amp;#039;, default=&amp;#039;.+&amp;#039;,&lt;br /&gt;
                     help=r&amp;#039;Regular expression that URLs must match if they are &amp;#039;&lt;br /&gt;
                     &amp;#039;to be crawled, e.g. &amp;quot;.+intranet\.smurfgeburf\.com.+&amp;quot; to &amp;#039;&lt;br /&gt;
                     &amp;#039;stay within the Smurfgeburf intranet&amp;#039;)&lt;br /&gt;
   parser.add_option(&amp;#039;-l&amp;#039;, &amp;#039;--loop&amp;#039;, action=&amp;#039;store_true&amp;#039;, dest=&amp;#039;loop&amp;#039;,&lt;br /&gt;
                     default=False, help=&amp;#039;If this flag is given, Kongulo will &amp;#039;&lt;br /&gt;
                     &amp;#039;keep fetching the specified page and pages it points to.  &amp;#039;&lt;br /&gt;
                     &amp;#039;It will not refetch pages that haven\&amp;#039;t changed.&amp;#039;)&lt;br /&gt;
   parser.add_option(&amp;#039;-s&amp;#039;, &amp;#039;--sleep&amp;#039;, type=&amp;#039;int&amp;#039;, dest=&amp;#039;sleep&amp;#039;, default=60,&lt;br /&gt;
                     help=&amp;#039;Number of minutes to sleep before looping (default &amp;#039;&lt;br /&gt;
                     &amp;#039;60). Only valid if -l is also specified.&amp;#039;)&lt;br /&gt;
   parser.add_option(&amp;#039;-p&amp;#039;, &amp;#039;--passwords&amp;#039;, dest=&amp;#039;pw&amp;#039;,&lt;br /&gt;
                     help=&amp;#039;Comma-delimited list of user IDs at names that will &amp;#039;&lt;br /&gt;
                     &amp;#039;be matched as substrings against the domain or &amp;quot;region&amp;quot; &amp;#039;&lt;br /&gt;
                     &amp;#039;that a password is needed for, e.g. &amp;#039;&lt;br /&gt;
                     &amp;#039;&amp;quot;joi@google.com,admin@192.168.250.1,snafu@slashdot.org&amp;quot;.  &amp;#039;&lt;br /&gt;
                     &amp;#039;You will be prompted for each password.&amp;#039;)&lt;br /&gt;
   (options, args) = parser.parse_args()&lt;br /&gt;
   if len(args) &amp;lt; 1:&lt;br /&gt;
     parser.error(&amp;#039;Provide at least one base URL&amp;#039;)&lt;br /&gt;
   &lt;br /&gt;
  #  try:&lt;br /&gt;
  #    obj = win32com.client.Dispatch(&amp;#039;GoogleDesktopSearch.Register&amp;#039;)&lt;br /&gt;
  #  except pythoncom.ole_error:&lt;br /&gt;
  #    print (&amp;#039;ERROR: You need to install Google Desktop Search to be able to &amp;#039;&lt;br /&gt;
  #           &amp;#039;use Kongulo.&amp;#039;)&lt;br /&gt;
  #    sys.exit(2)&lt;br /&gt;
  &lt;br /&gt;
  #  try:&lt;br /&gt;
  #    # Register with GDS.  This is a one-time operation and will return an&lt;br /&gt;
  #    # error if already registered.  We cheat and just catch the error and&lt;br /&gt;
  #    # do nothing.&lt;br /&gt;
  #    obj.RegisterComponent(_GUID,&lt;br /&gt;
  #             [&amp;#039;Title&amp;#039;, &amp;#039;Kongulo&amp;#039;, &amp;#039;Description&amp;#039;, &amp;#039;A simple web spider that &amp;#039;&lt;br /&gt;
  #              &amp;#039;lets you keep copies of web sites in your Google Desktop Search &amp;#039;&lt;br /&gt;
  #              &amp;#039;index.&amp;#039;, &amp;#039;Icon&amp;#039;, &amp;#039;%SystemRoot%\system32\SHELL32.dll,134&amp;#039;])&lt;br /&gt;
  &lt;br /&gt;
  #    # TODO Provide an unregistration mechanism.&lt;br /&gt;
  #  except:&lt;br /&gt;
  #    # TODO narrow to only the error that GDS returns when component&lt;br /&gt;
  #    # already registered&lt;br /&gt;
  #    pass&lt;br /&gt;
 &lt;br /&gt;
   passwords.Populate(options)&lt;br /&gt;
   Crawler(options).Crawl(args)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
 if __name__ == &amp;#039;__main__&amp;#039;:&lt;br /&gt;
   Main()&lt;/div&gt;</summary>
		<author><name>imported&gt;Unknown</name></author>
	</entry>
</feed>