UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: unexpected end of data

SOURCE_VOCAB_SIZE = 37_000
TARGET_VOCAB_SIZE = 37_000
DATA_FNAME = 'D:/GPU6/spa-eng/sha5.txt'

import random

import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras.layers import TextVectorization
def load_data(fname):
    # open the file with utf-8 encoding; note that errors='ignore'
    # silently drops any bytes that are not valid UTF-8
    with open(fname, "r", encoding="utf-8", errors='ignore') as textFile:
        # the source and the target sentence are demarcated with a comma;
        # iterate over each line and split it to get the individual
        # source and target sentence pairs
        lines = textFile.readlines()
        pairs = [line.rstrip('\n').split(",") for line in lines]
        # randomly shuffle the pairs
        random.shuffle(pairs)
        # collect the source sentences and target sentences into
        # respective lists
        source = [src for src, _ in pairs]
        target = [trgt for _, trgt in pairs]
    # return the list of source and target sentences
    return (source, target)
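
# --- Hedged diagnostic sketch (not part of the original post): since
# errors='ignore' above silently drops undecodable bytes, it can help to
# first find where strict UTF-8 decoding actually fails in the data file.
def check_utf8(fname):
    with open(fname, "rb") as f:
        raw = f.read()
    try:
        raw.decode("utf-8")
        print("file is valid UTF-8")
    except UnicodeDecodeError as e:
        print(f"invalid bytes at offsets {e.start}-{e.end}: {raw[e.start:e.end]!r}")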

def splitting_dataset(source, target):
    # calculate the training and validation size
    trainSize = int(len(source) * 0.6)
    valSize = int(len(source) * 0.2)
    # split the inputs into train, val, and test
    (trainSource, trainTarget) = (source[:trainSize], target[:trainSize])
    (valSource, valTarget) = (
        source[trainSize : trainSize + valSize],
        target[trainSize : trainSize + valSize],
    )
    (testSource, testTarget) = (
        source[trainSize + valSize :],
        target[trainSize + valSize :],
    )
    # return the splits
    return (
        (trainSource, trainTarget),
        (valSource, valTarget),
        (testSource, testTarget),
    )
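
# --- Hedged sanity check (not part of the original post): with the 60/20/20
# ratios above, a 10-pair toy corpus should split 6/2/2.
_src = [f"s{i}" for i in range(10)]
_tgt = [f"t{i}" for i in range(10)]
_tr, _va, _te = splitting_dataset(_src, _tgt)
assert (len(_tr[0]), len(_va[0]), len(_te[0])) == (6, 2, 2)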
def tf_split_punct1(text):
    # normalize accented/composed characters
    text = tf_text.normalize_utf8(text, "NFKD")

    # remove Sharada digits and everything outside the keep-list below.
    # NOTE: the character classes here originally contained literal Sharada
    # glyphs that were lost when the code was pasted, so the empty patterns
    # are no-ops as written
    text = tf.strings.regex_replace(text, "[^ -.।॥,]", "")
    text = tf.strings.regex_replace(text, '', '')
    text = tf.strings.regex_replace(text, '', '')
    # '[]' is an invalid (empty) character class in RE2 and raises an error,
    # so it is commented out here
    # text = tf.strings.regex_replace(text, '[]', '')
    # text = tf.strings.unicode_decode(text, input_encoding='UTF-8')
    # strip whitespace and add [START] and [END] tokens
    text = tf.strings.strip(text)
    text = tf.strings.join(["[START]", text, "[END]"], separator=" ")
    
    # return the processed text
    return text
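
# --- Hedged alternative (assuming the glyphs lost from tf_split_punct1's
# character class covered the Sharada block, U+11180-U+111DF). RE2, which
# backs tf.strings.regex_replace, accepts \x{...} escapes, and those survive
# copy-paste where raw Sharada glyphs do not:
def tf_split_punct1_escaped(text):
    text = tf_text.normalize_utf8(text, "NFKD")
    # keep spaces, the Sharada block, and basic punctuation; drop the rest
    text = tf.strings.regex_replace(text, r"[^ \x{11180}-\x{111DF}.।॥,]", "")
    # remove Sharada digits (U+111D0-U+111D9), per the original comment
    text = tf.strings.regex_replace(text, r"[\x{111D0}-\x{111D9}]", "")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[START]", text, "[END]"], separator=" ")
    return text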

def tf_split_punct2(text):
    # normalize accented/composed characters
    text = tf_text.normalize_utf8(text, "NFKD")

    # keep only spaces, Devanagari letters, and basic punctuation
    text = tf.strings.regex_replace(text, "[^ अ-ह.।॥,]", "")
    # remove the double danda
    text = tf.strings.regex_replace(text, '॥', '')
    # NOTE: the pattern below originally contained a literal character that
    # was lost when the code was pasted; as written it is a no-op
    text = tf.strings.regex_replace(text, '', '')
    # remove Devanagari digits
    text = tf.strings.regex_replace(text, '[०१२३४५६७८९]', '')
    # text = tf.strings.unicode_decode(text, input_encoding='UTF-8')
    # strip whitespace and add [START] and [END] tokens
    text = tf.strings.strip(text)
    text = tf.strings.join(["[START]", text, "[END]"], separator=" ")
   
    # return the processed text
    return text
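
# --- Quick eager check (not part of the original post): tf.strings ops run
# eagerly, so the standardizer can be tested on a short Devanagari sample.
# After NFKD, combining signs are separate code points outside अ-ह, so the
# first regex removes them along with the digits.
print(tf_split_punct2(tf.constant("नमस्ते १२३॥")))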

print("[INFO] loading data from {DATA_FNAME}...")
(source, target) = load_data(fname=DATA_FNAME)
print("[INFO] splitting the dataset into train, val, and test...")
(train, val, test) = splitting_dataset(source=source, target=target)
seq_length = 20
# create source text processing layer and adapt on the training
# source sentences
print("[INFO] adapting the source text processor on the source dataset...")
sourceTextProcessor = TextVectorization(
    standardize=tf_split_punct1, max_tokens=SOURCE_VOCAB_SIZE, split="whitespace"
)
sourceTextProcessor.adapt(train[0])
sourceTextProcessor.get_vocabulary()
# create target text processing layer and adapt on the training
# target sentences
print("[INFO] adapting the target text processor on the target dataset...")
targetTextProcessor = TextVectorization(
    standardize=tf_split_punct2, max_tokens=TARGET_VOCAB_SIZE, split="whitespace"
)
targetTextProcessor.adapt(train[1])
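
# --- Hedged usage sketch (not part of the original post): after adapt(), the
# layer maps raw strings to int64 token ids; in TextVectorization's default
# vocabulary, index 0 is the padding token and index 1 is [UNK].
print(targetTextProcessor(tf.constant(["नमस्ते"])))
print(targetTextProcessor.get_vocabulary()[:10])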

When this code is used for machine translation with a transformer model, it raises the error: UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: unexpected end of data.

Here the text file contains Sharada script as the source and Devanagari script as the target. Please suggest a solution. When we run tf_split_punct1 on its own to remove punctuation from English text, it returns the same English text without the punctuation, but when we do the same for Devanagari or Sharada text it gives this:

tf.Tensor(b'\xf0\x91\x86\xa0\xf0\x91\x87\x80\xf0\x91\x86\xae\xf0\x91\x86\xb3\xf0\x91\x86\x81 \xf0\x91\x86\xa9\xf0\x91\x86\xa4\xf0\x91\x86\xb1\xf0\x91\x86\xb3\xf0\x91\x86\xa2\xf0\x91\x86\xa3\xf0\x91\x86\xb3\xf0\x91\x86\xa9\xf0\x91\x86\xb4 \xf0\x91\x86\xa8\xf0\x91\x86\x93\xf0\x91\x86\xae\xf0\x91\x86\xa2\xf0\x91\x87\x80\xf0\x91\x86\xae\xf0\x91\x86\xb5\xf0\x91\x86\xa0\xf0\x91\x86\xbc \xf0\x91\x86\xa8\xf0\x91\x86\xae\xf0\x91\x86\xa2\xf0\x91\x87\x80\xf0\x91\x86\xae\xf0\x91\x86\xbc\xf0\x91\x86\xb0\xf0\x91\x86\xb4\xf0\x91\x86\x9f\xf0\x91\x86\xb5\xf0\x91\x86\xa9\xf0\x91\x87\x80 \xf0\x91\x87\x86\xf0\x91\x87\x86 \xf0\x91\x86\xa4\xf0\x91\x86\xa9\xf0\x91\x86\xbe\xf0\x91\x86\xb1\xf0\x91\x87\x80\xf0\x91\x86\xa0\xf0\x91\x86\xb6 \xf0\x91\x86\xa0\xf0\x91\x86\xbc', shape=(), dtype=string) 
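
That printed tensor is not itself corrupt: a scalar tf.string tensor holds raw UTF-8 bytes, and the \xf0\x91... groups are valid 4-byte encodings of Sharada code points (block U+11180-U+111DF). A quick check in plain Python, using the first two characters from the output above:

b = b"\xf0\x91\x86\xa0\xf0\x91\x87\x80"            # first two characters from the dump
print(b.decode("utf-8"))                           # renders the Sharada characters
print([hex(ord(c)) for c in b.decode("utf-8")])    # ['0x111a0', '0x111c0']

A UnicodeDecodeError saying "unexpected end of data" usually means a multi-byte sequence was cut off mid-character, which is easy to trigger with 4-byte Sharada characters if byte strings are sliced or truncated before being decoded.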
