#!/usr/bin/python
# Simple breadth-first web crawler for .html/.htm pages
# (Python 2: urllib.urlopen and sgmllib were removed in Python 3).
import urllib
from urlparse import urljoin
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    """Collects the href value of every <a> tag in the parsed page."""

    def reset(self):
        self.urls = []
        SGMLParser.reset(self)

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs for the <a> tag.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
def doUrls(url):
    # Report a discovered URL; replace with real per-page processing.
    print "URL:\t" + url
def crawlWeb(web_url):
    """Breadth-first crawl of html/htm pages at or below web_url."""
    lenW = len(web_url.split("/"))   # path depth of the starting URL
    doUrls(web_url)

    ToCrawlUrls = [web_url]          # frontier for the current round
    ToCrawlUrls_temp = []            # frontier for the next round
    BeCrawledUrls = []               # pages already fetched

    while ToCrawlUrls:
        for i_url in ToCrawlUrls:
            try:
                raw_html = urllib.urlopen(i_url).read()
            except IOError:
                BeCrawledUrls.append(i_url)   # skip unreachable pages
                continue
            lister = URLLister()
            lister.feed(raw_html)
            lister.close()

            for j_url in lister.urls:
                # Only follow links to html/htm pages.
                if not j_url.endswith(('html', 'htm')):
                    continue
                if not j_url.startswith('http:'):
                    if j_url.startswith('.'):
                        continue   # the original skips './'-style links
                    # Resolve relative links against the current page.
                    # (The original concatenated the two strings, which
                    # breaks when i_url is itself a page URL; urljoin
                    # handles both cases.)
                    j_url = urljoin(i_url, j_url)

                # Queue the link if it is new, no shallower than the
                # starting URL, and contains the URL of the page it was
                # found on. A plain substring test replaces re.search
                # here, so regex metacharacters in URLs cannot break the
                # match; the extra ToCrawlUrls_temp check avoids queueing
                # the same page twice in one round.
                if (j_url not in BeCrawledUrls
                        and j_url not in ToCrawlUrls_temp
                        and len(j_url.split("/")) >= lenW
                        and i_url in j_url):
                    ToCrawlUrls_temp.append(j_url)
                    doUrls(j_url)

            BeCrawledUrls.append(i_url)

        ToCrawlUrls = ToCrawlUrls_temp
        ToCrawlUrls_temp = []
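

# --- Usage sketch -----------------------------------------------------
# The script defines crawlWeb but never calls it; below is a minimal
# entry point. The start URL is a hypothetical placeholder, not one from
# the original code; a directory-style URL ending in "/" works best,
# since the crawler only queues links that contain the URL of the page
# they were found on.
if __name__ == '__main__':
    crawlWeb("http://www.example.com/docs/")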