How to validate a word in python?

1k Views Asked by At

I have a list in Python like this:

`list = ['thatCreation', 'happeningso', '’', 'comebecause',]

Question :

I want specific words:

For e.g. -> 'thatCreation' -> 'that', 'creation'
            'happeningso' -> 'happening', 'so'
            'comebecause' -> 'come', 'because' `

Thanks in advance for solving it in python.

3

There are 3 best solutions below

5
bittermelonman On

It looks like you are trying to take words merged together in camel case and break it apart. There is a great algorithm called Viterbi that does this really well.

I can't explain the magic behind it, but I implemented it in my program recently and it works really well. My understanding is it calculates the probability of each word and splits on that. This algorithm can split words in any case.

def words(text):
    """Every maximal run of letters in *text*, lowercased first."""
    return re.findall('[a-z]+', text.lower())


def word_prob(word):
    """Relative frequency of *word* in the corpus (0.0 for unseen words)."""
    return dictionary[word] / total


# Corpus statistics used by the Viterbi segmentation.
# NOTE(review): `words_path` must be defined elsewhere and point at a
# plain-text word corpus -- confirm before running.
dictionary = Counter(words(open(words_path).read()))
max_word_length = max(len(w) for w in dictionary)
total = float(sum(dictionary.values()))

def viterbi_segment(text):
    """Split *text* into its most probable sequence of dictionary words.

    Dynamic programming over split points: best[i] is the probability of
    the best segmentation of text[:i], and split_at[i] is where its last
    word begins.  Returns (words, probability).
    """
    best = [1.0]
    split_at = [0]
    for end in range(1, len(text) + 1):
        # Only consider words no longer than the longest dictionary word.
        lo = max(0, end - max_word_length)
        p, s = max((best[start] * word_prob(text[start:end]), start)
                   for start in range(lo, end))
        best.append(p)
        split_at.append(s)
    # Walk backwards from the end, collecting the chosen words.
    pieces = []
    pos = len(text)
    while pos > 0:
        pieces.append(text[split_at[pos]:pos])
        pos = split_at[pos]
    pieces.reverse()
    return pieces, best[-1]

# Segment the camelCase word into plain words, then re-join as camelCase.
sentence = ' '.join(viterbi_segment('thatCreation'.lower())[0])
print('sentence: {0}'.format(sentence))
# BUG FIX: the original called bare `split(...)`, which is undefined --
# it must be `re.split` (the `re` module is already used above).
word = ''.join(a.capitalize() for a in re.split('([^a-zA-Z0-9])', sentence)
               if a.isalnum())
print('word: {0}'.format(word[0].lower() + word[1:]))

You need a dictionary of a ton of words, there are multiple out there, but I used: https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt

and updated it with new words that it didn't have.

2
pankaj giri On

Borrowed from Peter Norvig's pytudes to perform word segmentation. Please try:

import math
import random
import re
from collections import Counter
from functools import lru_cache
from itertools import permutations
from typing import Callable, Dict, List, Set, Tuple

import matplotlib.pyplot as plt

!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt

Word = str     # a "word" is simply a string
cat = ''.join  # concatenate a sequence of strings into one string


def tokens(text) -> List[Word]:
    """Return every run of consecutive letters in *text*, lowercased."""
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

# Corpus: Norvig's big.txt (https://norvig.com/big.txt), assumed to be
# present in the current working directory (downloaded by the !wget above).
TEXT = open('big.txt').read()
WORDS = tokens(TEXT)  # every lowercase alphabetic token in the corpus


class ProbabilityFunction:
    """Mixin: view a mapping of outcome -> count as a probability function."""

    def __call__(self, outcome):
        """Probability of `outcome`: its count over the (lazily cached) total."""
        try:
            denominator = self.total
        except AttributeError:
            # First call: cache the sum of all counts on the instance.
            denominator = self.total = sum(self.values())
        return self[outcome] / denominator


class Bag(Counter, ProbabilityFunction):
    """A bag of words: word counts that can also be queried as probabilities."""
    

Pword = Bag(WORDS)


def Pwords(words: List[Word]) -> float:
    """Probability of a word sequence under a unigram (independent-word) model."""
    return Π(map(Pword, words))

def Π(nums) -> float:
    """Return the product of *nums* -- the multiplicative analogue of `sum`."""
    product = 1
    for factor in nums:
        product *= factor
    return product

def splits(text, start=0, end=20) -> List[Tuple[str, str]]:
    """Return a list of all (first, rest) pairs; start <= len(first) <= end.

    The length of `first` is also capped by len(text).  (The original
    docstring referred to a nonexistent parameter `L`, and the return
    annotation claimed a single tuple rather than a list of them.)
    """
    return [(text[:i], text[i:])
            for i in range(start, min(len(text), end) + 1)]

@lru_cache(maxsize=None)
def segment(text) -> List[Word]:
    """Return a list of words that is the most probable segmentation of text.

    Tries every (first, rest) split (first word at least one letter) and
    keeps the candidate maximizing the unigram probability Pwords.

    Memoized with lru_cache: without it the recursion re-segments the same
    suffixes exponentially many times (Norvig's pytudes memoizes it too).
    Callers must not mutate the returned list -- it is shared via the cache.
    """
    if not text:
        return []
    candidates = ([first] + segment(rest)
                  for (first, rest) in splits(text, 1))
    return max(candidates, key=Pwords)

strings = ['thatCreation', 'happeningso', 'comebecause']
# Expected: [['that', 'creation'], ['happening', 'so'], ['come', 'because']]
[segment(string.lower()) for string in strings]

--2020-08-04 18:48:06-- https://raw.githubusercontent.com/dwyl/english-words/master/words.txt Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 4863005 (4.6M) [text/plain] Saving to: ‘words.txt.2’

words.txt.2 100%[===================>] 4.64M 162KB/s in 25s

2020-08-04 18:48:31 (192 KB/s) - ‘words.txt.2’ saved [4863005/4863005]

[['that', 'creation'], ['happening', 'so'], ['come', 'because']]

0
Rohan Patel On
import re
from collections import Counter

def viterbi_segment(text):
    """Segment *text* into the most probable word sequence.

    Viterbi dynamic programming: probs[i] holds the probability of the best
    segmentation of text[:i]; lasts[i] holds the start index of that
    segmentation's final word.  Returns (words, probability).

    BUG FIX: the two appends must execute inside the for loop.  In the
    original they were dedented out of it, so probs/lasts received only one
    entry (from the final iteration) and the backtracking indexed past the
    end of the lists.
    """
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    # Backtrack from the end of the text, emitting the chosen words.
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]
    

def word_prob(word):
    """Relative corpus frequency of *word* (0.0 when unseen)."""
    return dictionary[word] / total


def words(text):
    """All maximal runs of letters in *text*, lowercased."""
    return re.findall('[a-z]+', text.lower())


# Corpus statistics from Norvig's big.txt (https://norvig.com/big.txt).
dictionary = Counter(words(open('big.txt').read()))
max_word_length = max(len(w) for w in dictionary)
total = float(sum(dictionary.values()))

camel_words = ['thatCreation', 'happeningso', 'comebecause']
for w in camel_words:
    print(viterbi_segment(w.lower()))

O/p will be - 
(['that', 'creation'], 1.63869514118246e-07)
(['happening', 'so'], 1.1607123777400279e-07)
(['come', 'because'], 4.81658105705814e-07)

I got the solution to my problem from @Darius Bacon; for it to work, you need to convert all the strings to lowercase first. Thank you, everyone, for your help.

Visit this link to download big.txt: https://norvig.com/big.txt