Here is the custom_standardize function I'm using for my task.
def custom_standardization(input_data):
# Lowercase the text and remove punctuation
stop_words = set(stopwords.words('english'))
lowercase = tf.strings.lower(input_data)
words = tf.strings.split(lowercase, r'\s+|[{}]'.format(re.escape(string.punctuation)))
stripped_html = tf.strings.regex_replace(words, "<br />", " ")
stripped_html = tf.strings.regex_replace(stripped_html,r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', ' ')
stripped_html = tf.strings.regex_replace(stripped_html, r'@([A-Za-z0-9_]+)', ' ' )
stripped_html = tf.strings.regex_replace(stripped_html, r'\b\w\b', ' ')
for i in stop_words:
stripped_html = tf.strings.regex_replace(stripped_html, f' {i} ', " ")
processed_text = tf.strings.regex_replace(lowercase, '[%s]' % re.escape(string.punctuation + string.digits), '')
return processed_text
vectorize_layer = tf.keras.layers.TextVectorization(
standardize=custom_standardization,
max_tokens=vocab_size,
output_mode='int',
output_sequence_length=None)
Most of the task is been efficiently done but I'm facing issues with words splitting. My output right now looks something like this.
['modifying', 'modify', 'modifiedprospective', 'mine', 'millionsyears', 'millionshares', 'millionand', 'michael','thoseapplicable', 'thistime', 'thisstatement', 'thissite', 'thisrequest', 'thisis', 'thisdecrease', 'thiscompetition', 'thiscase', 'thisassessment', 'thevariable', 'theutility', 'theultimate', 'thetreatment', 'thetransactions', 'theterms', 'thetermination', 'thetaxes', 'thetable', 'thesystematic', 'thestorage', 'thestatements', 'thestaffs', 'thespent', 'theseparately', 'theseestimates', 'theseagreements', 'thesarbannesoxley', 'theremeasurement', 'theregistration', 'thereforemanagement', 'therecovery', 'thereby', 'thereallocation', 'thereafterthe', 'theproceedings', 'thepossibility', 'thepositive', 'theplant', 'theperformance', 'theparticipant', 'theopportunity', 'theomission', 'theobligations', 'theobligation', 'thenotes', 'thenet', 'theminimum', 'themcv', 'themajority', 'thelevel', 'thelast', 'thelargest', 'theinterest', 'theinstructions', 'theinitial', 'theinformation', 'theimpact', 'theholders', 'thegreat', 'thegrantdate', 'theformation', 'thefoodservice', 'thefive', 'thefiscal', 'theferc', 'theextent', 'theestimate', 'theend', 'theelimination', 'theeffective', 'theearnings', 'thedomestic', 'thedevelopment', 'thecosts', 'thecontrols', 'thecontract', 'theborrowing', 'theblackscholesmerton', 'thebalance', 'theattorney', 'theassigning', 'thealjs', 'theadditional', 'theaccount', 'thatwere', 'thatutilities', 'thatunderwriters', 'thatthese', 'thattheir', 'thatsuch', 'thatset', 'thatrange']
How do I split these words? I experimented with adding
split='whitespace' or something similar as well
Can someone suggest what can I do solve this issue?