No performance gain with Python threading


I'm writing a parallel crawler in Python and storing some information in MongoDB. After testing I realized that my code, even though it uses threading, is not actually running in parallel: it makes no difference whether I use a single thread, 10 threads, or 50 threads. I can't figure out why.

EDIT: From what I can see, most of the processing time is spent in soup = BeautifulSoup(html). Could it be that this call can't be parallelized using threads?
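As a quick check (a minimal sketch, not part of my crawler; the URL and the repetition counts are just placeholders), I can run the same total amount of parsing with one thread and with ten threads. If BeautifulSoup holds the GIL for the whole parse, the two timings should come out roughly the same:

from threading import Thread
import urllib2
import time
from BeautifulSoup import BeautifulSoup

html = urllib2.urlopen("http://google.com").read()

def parse_n_times(n):
    # CPU-bound work: the parser holds the GIL while it runs.
    for _ in range(n):
        BeautifulSoup(html)

def timed(num_threads, parses_per_thread):
    threads = [Thread(target=parse_n_times, args=(parses_per_thread,))
               for _ in range(num_threads)]
    t0 = time.time()
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.time() - t0

# Same total work (40 parses) in both runs; if parsing is GIL-bound,
# the two timings should be roughly equal.
print "1 thread: ", timed(1, 40)
print "10 threads:", timed(10, 4)

Here is the full crawler: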

from threading import Thread
import Queue
import urllib2
import re
from BeautifulSoup import *
from urlparse import urljoin
from pymongo import MongoClient
from urlparse import urlparse
import time
import hashlib

start_time = time.time()

level = 1
client = MongoClient()
db = client.crawler
visited = {}

# Worker: pull a URL from q_start, fetch and parse it, store the page
# in MongoDB, and push newly discovered links onto q_new.
def doWork():
    while True:
        try:
            myUrl = q_start.get()
        except:
            continue
        try:
            c=urllib2.urlopen(myUrl)
        except:
            q_start.task_done()
            continue

        parsed_url = urlparse(myUrl)

        html=c.read()
        try:
            soup = BeautifulSoup(html)
        except:
            q_start.task_done()
            continue
        txt = soup.prettify()
        links = soup('a')
        m = hashlib.md5(myUrl)

        db.urls.insert(
                {
                    "url":myUrl,
                    "HTML":txt,
                    "level":level,
                    "domain":parsed_url.netloc,
                    "md5":m.hexdigest()
                }
        )

        for link in links:
            if 'href' in dict(link.attrs):
                url = urljoin(myUrl,link['href'])
                if url.find("'")!=-1:
                    continue
                url=url.split('#')[0]
                if url[0:4] == 'http':
                    if url in visited:
                        continue
                    else:
                        visited[url]=True
                        q_new.put(url)
        q_start.task_done() 

# q_start holds the URLs for the current depth level; q_new collects
# the links discovered while processing it.
q_start = Queue.Queue()
q_new = Queue.Queue()

# Spawn 50 daemon worker threads that all consume from q_start.
for i in range(50):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

q_start.put("http://google.com")
q_start.join()

# Crawl breadth-first: once a level drains, move the URLs collected
# in q_new over to q_start and process the next level.
for i in range(2,5):
    print "Depth: "
    print i
    print time.time() - start_time
    level += 1
    print q_new.qsize()
    q_aux = q_new
    q_new = Queue.Queue()
    while q_aux.empty() != True:
        x = q_aux.get()
        q_start.put(x)
    q_start.join()

print "end"

print time.time() - start_time
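If the parse step really is GIL-bound, one option I'm considering is moving only the BeautifulSoup work into a multiprocessing.Pool so it runs on separate cores, while threads (or a single thread) keep handling the network I/O. A rough sketch of the idea (parse_page, POOL_SIZE and the URL list are made-up names for illustration, not part of the code above):

from multiprocessing import Pool
import urllib2
from BeautifulSoup import BeautifulSoup

POOL_SIZE = 4

def parse_page(html):
    # Runs in a separate process, so several pages can be parsed at once.
    soup = BeautifulSoup(html)
    links = [a['href'] for a in soup('a') if 'href' in dict(a.attrs)]
    return soup.prettify(), links

if __name__ == '__main__':
    pool = Pool(POOL_SIZE)
    urls = ["http://google.com", "http://example.com"]
    # Fetching is I/O-bound and could stay in threads; only the parse
    # is handed to the worker processes.
    htmls = [urllib2.urlopen(u).read() for u in urls]
    for txt, links in pool.map(parse_page, htmls):
        print len(links)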