How to fix" xrange() arg 3 must not be zero" error in python using parallel programming?

import time
from multiprocessing import Process, Pool
import sys, os, inspect
import urllib
import re
index ={}
graph={}
# Common words that we don't want to be part of the index
g=['is','a','the','ga','all','to','under']


def rm_tag(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def get_page(url):
    try:
        f = urllib.urlopen(url)
        page = f.read()
        f.close()
        return page
    except:
        return ""

def union(a,b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_url(page):
    start_link=page.find("<a href=")
    if(start_link==-1):
        return None,0
    start_quote=page.find('"',start_link)
    end_quote=page.find('"',start_quote+1)
    url=page[start_quote+1:end_quote]
    return url,end_quote

def get_all_links(page):
    links=[]
    while True:
        url,endpos=get_next_url(page)
        page=page[endpos:]
        if url:
            links.append(url)
        else:
            break
    print "get_all_links: %i links found\n" % len(links)
    graph[page]=[links]
    return graph[page]


def add_to_index(index,url,keyword):
        if keyword in index:
                if url not in index[keyword]:
                        index[keyword].append(url)
                return

        global g
        if keyword not in g:
          index[keyword]=[url]

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, url,word)

def compute_ranks(graph):
    d=0.8
    numloops=20
    ranks={}
    npages=len(graph)
    for page in graph:
        ranks[page]=1.0/npages
    for i in range(0,numloops):
        newranks={}
        for page in graph:
            newrank=(1-d)/npages
            for node in graph:
                if page in graph[node]:
                    newrank=newrank + d * (ranks[node]/len(graph[node]))
            newranks[page]=newrank
        ranks=newranks
    return ranks
def Look_up(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def Look_up_new(index,ranks,keyword):
    pages=Look_up(index,keyword)
    if pages:
        for i in pages:
            print  '\n Results with Page Ranks :\n'+i+" --> "+str(ranks[i])
    else:
        print "Keyword does not exist"

def lucky_search(index, ranks, keyword):
    try:
        pages = Look_up(index, keyword)
        if pages:
            bestpage = pages[0]
            for candidate in pages:
                if ranks[candidate] > ranks[bestpage]:
                    bestpage = candidate
            return (bestpage, ranks[bestpage], pages)
    except:
        print "Keyword does not exist",sys.exc_info()[0]
        return None


def print_profile(index, graph):
    print "*****************"
    print "Length of index", len(index)
    print "Length of graph", len(graph)

    i = 0
    for e in index:
        i = i + 1
        print i, ":", e
        if i > 20:
            break
    print "*****************"

def print_profile_top (index, ranks):
    max = 0
    for e in index:
        link_count = len(index[e])
        if link_count > max:
            max = link_count   # updating the highest no of links we have found so far
            print e, link_count, lucky_search(index,ranks,e)[1]
    print "*********************"


#print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)
#print lucky_search(index,ranks,'limited')

def chunks(l, n):
    for i in xrange(0,len(l),n):
        yield l[i:i+n] # instantly makes chunks a "generator function" instead of a normal function

if __name__ == '__main__':
    start = time.clock()    
    c = 3
    seed= "http://www.python.org"
    keyword = "CGI"
    max_page = 20
    tocrawl=[seed]
    crawled =[]
    print '\nCrawling using ' + seed + " as seed_page and search_term: " + keyword
    pool = Pool (c)
    while tocrawl:
        page=tocrawl.pop()
        if page not in crawled:
            max_page -= 1 
            if max_page<=0:
                break
            content=get_page(page)
            text=content
            partitioned_text= list(chunks(text, len(text) / c))
            links=pool.map(get_all_links, partitioned_text)
            #links=get_all_links(content)
            #content=rm_tag(content)
            add_page_to_index(index,page,content)
            for e in links:
                if e not in tocrawl:
                    tocrawl.append(e)
            #union(tocrawl,graph[page])
            crawled.append(page)

    #pool.terminate()
    ranks=compute_ranks(graph)
    print_profile(index, graph)
    print_profile_top(index, ranks)
    print "result for :", keyword, ": ", lucky_search(index,ranks,keyword)
    t=time.clock() - start
    print "Processing Time :",t
#print crawled,index,graph

While running the code, the following error shows up. Please help me fix it.

Traceback (most recent call last):
  File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 250, in <module>
    partitioned_text= list(chunks(text, len(text) / c))
  File "C:\Documents and Settings\priyanka.14790\My Documents\Dropbox\Udacity\parallel.py", line 229, in chunks
    for i in xrange(0,len(l),n):
ValueError: xrange() arg 3 must not be zero

Here is the code for the normal (non-parallel) search engine, which runs without any error:

    import sys, os, inspect
    import urllib
    import re
    max_page=5

    # Common words that we don't want to be part of the index
    g=['is','a','the','ga','all','to','under']

    def rm_tag(data):
        p = re.compile(r'<.*?>')
        return p.sub('', data)

    def get_page(url):
        try:
            f = urllib.urlopen(url)
            page = f.read()
            f.close()
            return page
        except:
            return ""

    def union(a,b):
        for e in b:
            if e not in a:
                a.append(e)

    def get_next_url(page):
        start_link=page.find("<a href=")
        if(start_link==-1):
            return None,0
        start_quote=page.find('"',start_link)
        end_quote=page.find('"',start_quote+1)
        url=page[start_quote+1:end_quote]
        return url,end_quote

    def get_all_links(page):
        links=[]
        while True:
            url,endpos=get_next_url(page)
            page=page[endpos:]
            if url:
                links.append(url)
            else:
                break
        return links


    def add_to_index(index,url,keyword):
            if keyword in index:
                    if url not in index[keyword]:
                            index[keyword].append(url)
            index[keyword]=[url]



    def add_page_to_index(index, url, content):
        words = content.split()
        for word in words:
            add_to_index(index, url,word)

    def compute_ranks(graph):
        d=0.8
        numloops=20
        ranks={}
        npages=len(graph)
        for page in graph:
            ranks[page]=1.0/npages
        for i in range(0,numloops):
            newranks={}
            for page in graph:
                newrank=(1-d)/npages
                for node in graph:
                    if page in graph[node]:
                        newrank=newrank + d * (ranks[node]/len(graph[node]))
                newranks[page]=newrank
            ranks=newranks
        return ranks

    def crawl_web(seed):
        tocrawl=[seed]
        crawled =[]
        index ={}
        graph={}
        global max_page
        while tocrawl:
            page=tocrawl.pop()
            if page not in crawled:
                max_page -= 1 
                if max_page<=0:
                    break
                c = get_page(page)
                graph[page]=get_all_links(c)
                c=rm_tag(c)
                add_page_to_index(index,page,c)
                union(tocrawl,graph[page])
                crawled.append(page)
        return crawled,index,graph


    def Look_up(index, keyword):
        if keyword in index:
            return index[keyword]
        else:
            return None

    def lucky_search(index, ranks, keyword):
        try:
            pages = Look_up(index, keyword)
            if pages:
                bestpage = pages[0]
                for candidate in pages:
                    if ranks[candidate] > ranks[bestpage]:
                        bestpage = candidate
                return (bestpage, ranks[bestpage], pages)
        except:
            print "Keyword does not exist"


    seed_page = "http://www.python.org"
    search_term = "Take"

    try:
        print "Enter the Max Limit to Search :"
        max_limit=int(raw_input())
    except:
        max_page = 10

    max_page = max_limit

    print '\nCrawling using ' + seed_page + " as seed_page and search_term: " + search_term
    crawled,index,graph=crawl_web(seed_page)
    ranks=compute_ranks(graph)

    def print_profile(index, graph):
        print "*****************"
        print "Length of index", len(index)
        print "Length of graph", len(graph)
        i = 0
        for e in index:
            i = i + 1
            print i, ":", e
            if i > 20:
                break
        print "*****************"

    def print_profile_top (index, ranks):
        max1 = 0
        for e in index:
            link_count = len(index[e])
            if link_count > max1:
                max1= link_count 
                print e, link_count, lucky_search(index,ranks,e)[1]
        print "*********************"

    print_profile(index, graph)
    print_profile_top(index, ranks)

    print "result for :", search_term, ": ", lucky_search(index,ranks,search_term)

The output is:

    Enter the Max Limit to Search : 10

    Crawling using http://www.python.org as seed_page and search_term: Take
    *****************
    Length of index 1281
    Length of graph 9
    1 : Canada
    2 : limited
    3 : all
    4 : here"-->.
    5 : unclear,
    6 : CGI,
    7 : 08:00
    8 : enabled:
    9 : results
    10 : href=&quot;&quot;
    11 : :/
    12 : subtle
    13 : Take
    14 : Buildbot,
    15 : pyBiblio,
    16 : CD&#8221;,
    17 : href="/search-pymodules.xml"/>
    18 : nothing
    19 : Foundation
    20 : pyArkansas
    21 : depend
    *****************
    Canada 1 0.0222222222222
    *********************
    result for : Take :  ('http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/', 0.022821308980213083, ['http://www.timparkin.co.uk/2012/08/why-you-cant-make-digital-look-like-velvia-50/'])
    >>> 

Please run and see the difference.


There are 2 answers below.


It seems your text is empty. Since you have a bare except: return "" in get_page, anything could have gone wrong (a dead link, maybe). Add some logging to get_page.
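For example, a minimal sketch of get_page with the bare except replaced by one that reports the failure (the print format here is just an assumption, use whatever logging you prefer):

    import urllib

    def get_page(url):
        try:
            f = urllib.urlopen(url)
            page = f.read()
            f.close()
            return page
        except Exception as e:
            # Report the failing URL and the reason instead of silently
            # returning "", so you can see why text ended up empty.
            print "get_page failed for %s: %s" % (url, e)
            return ""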


Just avoid doing exactly what the error is telling you not to do: passing zero as the third (step) argument to xrange.

In [6]: len(text)
Out[6]: 0

In [7]: c
Out[7]: 3

In [8]: 0 / 3
Out[8]: 0

Since text is an empty string, len(text) returns 0, so len(text) / c is 0 and xrange gets a zero step.

A possible fix would be to catch the ValueError, or to make sure the chunk size is never zero, for example by skipping empty pages or by flooring the chunk size at 1.
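A minimal sketch of that guard, dropped into the main loop from the question (text, c, chunks, pool, get_all_links and get_page are the question's own names; the max(1, ...) floor is just one way to keep the step non-zero):

    content = get_page(page)
    text = content
    if text:
        # len(text) / c is 0 whenever the page is shorter than c characters,
        # so floor the chunk size at 1 to keep xrange's step non-zero
        chunk_size = max(1, len(text) / c)
        partitioned_text = list(chunks(text, chunk_size))
        links = pool.map(get_all_links, partitioned_text)
    else:
        # empty page (get_page swallowed an exception): nothing to partition
        links = []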