I recently asked a question about embedding data into an image. I promptly solved that issue with help from other forums. I have run into a new problem: my program works fine for all Latin characters and even ones with diacritical (accent) marks. My program fails for other unicode characters, such as thse from scripts like Cyrillic, Greek, and Arabic.
Here is the code for my current program that cannot do unicode.
def embed_text(self, image_path, text, output_path):
# Convert text message to binary format
binary_message = ''.join(format(ord(char), '08b') for char in text) #aligning the bytes for embedding the message
# Load the image
image = Image.open(image_path)
w, h = image.size
# Calculate the number of embedding characters (eN)
eN = (h * w * 3) // 8
if len(text) > eN:
raise ValueError("Message too long to fit in the image")
# Embedding loop
message_index = 0
for i in range(h):
for j in range(w):
pixel = list(image.getpixel((j, i)))
for k in range(3): # For R, G, B components
if message_index < len(binary_message):
M = int(binary_message[message_index])
# Perform XOR operation with the 7th bit of the RGB component
pixel[k] = (pixel[k] & 0xFE) | (((pixel[k] >> 1) & 1) ^ M)
message_index += 1
else:
break # No more message bits to embed
image.putpixel((j, i), tuple(pixel))
# Save the Stego Image
image.save(output_path)
def xor_substitution(self, component, bit):
# Perform XOR on the least significant bit of the component with the bit
return (component & 0xFE) | (component & 1) ^ bit
def extract_text(self, image_path):
stego_image = Image.open(image_path)
w, h = stego_image.size
binary_message = ""
for i in range(h):
for j in range(w):
pixel = stego_image.getpixel((j, i))
for k in range(3):
binary_message += str(pixel[k] & 1)
# Extract only up to the NULL character
end = binary_message.find('00000000')
if end != -1:
binary_message = binary_message[:end]
return self.binary_to_string(binary_message)
def binary_to_string(self, binary_message):
text = ""
for i in range(0, len(binary_message), 8):
byte = binary_message[i:i+8]
text += chr(int(byte, 2))
return text
As I said, these methods do well at embedding and extracting Latin text from the images I throw at it. When I try to embed things like
το
قي
8
I get things like
ñ;
ÈY♣
[]
In attempts to fix this issue, I have changed the 8-bit values in the lines that denote '08b' to '16b'. I found that the program still manages to embed things into images but takes out Japanese Kanji or Chinese characters.
Heere is the code I changed:
def embed_text(self, image_path, text, output_path):
# Convert text message to binary format
binary_message = ''.join(format(ord(char), '16b') for char in text)
# Load the image
image = Image.open(image_path)
w, h = image.size
# Calculate the number of embedding characters (eN)
eN = (h * w * 3) // 16
if len(text) > eN:
raise ValueError("Message too long to fit in the image")
binary_message = binary_message.ljust(eN * 16, '0')
# Embedding loop
message_index = 0
for i in range(h):
for j in range(w):
pixel = list(image.getpixel((j, i)))
for k in range(3): # For R, G, B components
if message_index < len(binary_message):
M = int(binary_message[message_index:message_index+16], 2)
# Perform XOR operation with the 7th bit of the RGB component
pixel[k] = (pixel[k] & 0xFFFE) | (((pixel[k] >> 1) & 1) ^ M)
message_index += 16
else:
break # No more message bits to embed
image.putpixel((j, i), tuple(pixel))
# Save the Stego Image
image.save(output_path)
def xor_substitution(self, component, bit):
# Perform XOR on the least significant bit of the component with the bit
return (component & 0xFE) | (component & 1) ^ bit
def extract_text(self, image_path):
stego_image = Image.open(image_path)
w, h = stego_image.size
binary_message = ""
for i in range(h):
for j in range(w):
pixel = stego_image.getpixel((j, i))
for k in range(3):
binary_message += format(pixel[k] & 1, 'b').zfill(16)[-1]
# Extract only up to the NULL character
end = binary_message.find('00000000')
if end != -1:
binary_message = binary_message[:end]
return self.binary_to_string(binary_message)
def binary_to_string(self, binary_message):
text = ""
for i in range(0, len(binary_message), 16):
byte = binary_message[i:i+16]
text += chr(int(byte, 2))
return text
I'd like to know how I can fix these issues my program has since I need to have this implementation finished by the 5th of December. Thanks in advance for your assistance!
Minimal changes to original code to make it work, with comments:
More efficient algorithm that removes int/str/int conversions and only processes input file up to the null: