#this line is line 481 in searchengineWithoutTry.py
# imports this snippet relies on (they would normally sit at the top of the file)
import urllib2
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

# Breadth-first crawl: starting from the seed pages, index each page,
# collect its outbound links, and crawl those on the next pass,
# repeating for `depth` passes.
def crawl(self, pages, depth=2):
    for i in range(depth):
        newpages = {}
        for page in pages:
            try:
                c = urllib2.urlopen(page)
            except:
                print "Could not open %s" % page
                continue
            try:
                soup = BeautifulSoup(c.read())
                self.addtoindex(page, soup)

                # queue every full http URL we haven't indexed yet
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1: continue
                        url = url.split('#')[0]  # remove the fragment portion
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages[url] = 1
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)

                self.dbcommit()
            except Exception, e:
                print "Could not parse page %s" % page
                print 'The exception is: ', e

        # the links found on this pass become the pages for the next pass
        pages = newpages
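
# A minimal usage sketch, not from the file itself: it assumes this method
# belongs to a `crawler` class defined earlier in searchengineWithoutTry.py
# whose constructor takes the path to the SQLite index database. The class
# name, constructor signature, db filename, and seed URL are all assumptions
# for illustration.
if __name__ == '__main__':
    pagelist = ['http://example.com/']      # hypothetical seed page
    crawler = crawler('searchindex.db')     # hypothetical db filename
    crawler.crawl(pagelist, depth=2)        # two breadth-first passes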