I'm writing a parallel crawler in Python and storing some information in MongoDB. After testing I realized that my code, even though it uses threading, is not parallel: it makes no difference whether I use a single thread, 10 threads, or 50 threads. I can't figure out why.
EDIT: From what I can see, most of the processing time is taken up by soup = BeautifulSoup(html). Could it be that this call can't be parallelized using threads?
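Here is a minimal sketch of how one can time the fetch and the parse separately to see where the time goes (the URL and the variable names are just for illustration):

import time
import urllib2
from BeautifulSoup import BeautifulSoup

t0 = time.time()
html = urllib2.urlopen("http://google.com").read()  # network I/O, waits on the socket
t1 = time.time()
soup = BeautifulSoup(html)  # pure-Python parsing
t2 = time.time()

print "fetch: %.3fs" % (t1 - t0)
print "parse: %.3fs" % (t2 - t1)

The full crawler code follows: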
from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin, urlparse
from pymongo import MongoClient
import time
import hashlib
start_time = time.time()
level = 1
client = MongoClient()
db = client.crawler
visited = {}  # shared across all worker threads (updated without a lock)
def doWork():
    while True:
        myUrl = q_start.get()  # blocks until a URL is available
        try:
            c = urllib2.urlopen(myUrl)
        except:
            q_start.task_done()
            continue
        parsed_url = urlparse(myUrl)
        html = c.read()
        try:
            soup = BeautifulSoup(html)  # this is where most of the time goes
        except:
            q_start.task_done()
            continue
        txt = soup.prettify()
        links = soup('a')
        m = hashlib.md5(myUrl)
        db.urls.insert(
            {
                "url": myUrl,
                "HTML": txt,
                "level": level,
                "domain": parsed_url.netloc,
                "md5": m.hexdigest()
            }
        )
        # queue up every new absolute link for the next depth level
        for link in links:
            if 'href' in dict(link.attrs):
                url = urljoin(myUrl, link['href'])
                if url.find("'") != -1:
                    continue
                url = url.split('#')[0]  # drop the fragment
                if url[0:4] == 'http':
                    if url not in visited:
                        visited[url] = True
                        q_new.put(url)
        q_start.task_done()
q_start = Queue.Queue()
q_new = Queue.Queue()
# start 50 daemon worker threads
for i in range(50):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()
q_start.put("http://google.com")
q_start.join()
# breadth-first crawl: after each level, drain q_new into q_start
# and wait for the workers to finish before moving to the next depth
for i in range(2, 5):
    print "Depth:", i
    print time.time() - start_time
    level += 1
    print q_new.qsize()
    q_aux = q_new
    q_new = Queue.Queue()
    while not q_aux.empty():
        x = q_aux.get()
        q_start.put(x)
    q_start.join()
print "end"
print time.time() - start_time
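If the parsing step really is the bottleneck, one idea I'm considering is moving the CPU-bound parsing into a multiprocessing.Pool so it runs in separate processes rather than threads. A rough sketch of that approach (the pool size and the sample page below are placeholders, and the MongoDB insert would still happen in the main process):

from multiprocessing import Pool
from BeautifulSoup import BeautifulSoup

def parse_html(html):
    # runs in a separate worker process
    soup = BeautifulSoup(html)
    links = [dict(a.attrs)['href'] for a in soup('a') if 'href' in dict(a.attrs)]
    return soup.prettify(), links

if __name__ == '__main__':
    pool = Pool(processes=4)  # placeholder worker count
    pages = ["<html><body><a href='http://example.com'>x</a></body></html>"]
    for txt, links in pool.map(parse_html, pages):
        print links

Would something along these lines actually help, or is there a different reason my threaded version shows no speedup?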