# Textual data in Python is handled with strings.
# Usual 8-bit string, isinstance(word, str) is True.
# bytes() is an alias for str().
#   "one", 'two', "three 'x' plus", 'four "y" minus'
#   "first" (whitespace) 'second'   # adjacent literals are merged to "firstsecond"
#   '''three single quotes'''
#   """three double quotes"""
# Raw strings (the r prefix disables most escape sequence processing).
#   r"raw string\n\t\a"
# Unicode strings (Python 2): isinstance(word, unicode) is True.
# A str holds raw bytes; unicode() decodes those bytes into code points.
unicode('unicode') # u'unicode', decoded with the default 'ascii' codec
unicode('unicode', 'utf-8') # same result; pure-ASCII bytes are also valid UTF-8
s = 'dół' # typed from a keyboard; contains non-ASCII characters
repr(s) # 'd\xc3\xb3\xc5\x82', raw bytes (assumes the text was entered as UTF-8)
print(s) # dół
#u = unicode("dół") # UnicodeDecodeError: the default 'ascii' codec rejects bytes >= 0x80
u = unicode("dół", 'utf-8')
repr(u) # u'd\xf3\u0142', code points rather than bytes
print(u) # dół
u = u"żółw" # typed from a keyboard ("turtle" in English)
repr(u) # u'\u017c\xf3\u0142w', code points
print(u) # żółw
# The \uXXXX escapes name the same code points as the plain ASCII letters
# a, b, c (ASCII 97, 98, 99), so all three spellings compare equal.
u'\u0061\u0062\u0063' == u'abc' == unicode('abc') # True
# Raw Unicode strings.
#   ur'raw\tunicode\nline'   # u'raw\\tunicode\\nline'
# One-character strings from integer codes (Python 2):
# chr() builds an 8-bit str, unichr() builds a one-character Unicode string,
# and ord() is the inverse of both.
chr(97) # returns 'a', 8-bit
chr(255) # returns '\xff', 8-bit
ord('\xff') # returns 255
unichr(97) # returns u'a' = u'\x61' = u'\u0061' = u'\U00000061'
unichr(256) # returns u'\u0100'
unichr(40960) # returns u'\ua000'
ord(u'\ua000') # returns 40960
# Unicode strings concatenate like ordinary strings.
u = unichr(40960) + u'abcd' + unichr(1972) # u'\ua000abcd\u07b4'
[ord(c) for c in u] # [40960, 97, 98, 99, 100, 1972], one code point per character
isinstance(u, unicode) # returns True
isinstance(u, str) # returns False
# encode() returns an 8-bit string version of the Unicode string,
# using the given encoding.
u_utf8 = u.encode('utf-8') # returns '\xea\x80\x80abcd\xde\xb4' (no u'...')
isinstance(u_utf8, unicode) # returns False
isinstance(u_utf8, str) # returns True
# decode() interprets the 8-bit string using the given encoding
# and returns a Unicode string (the inverse of encode()).
u_utf8.decode('utf-8') # returns u'\ua000abcd\u07b4'
u'\u0061'.encode('utf-8') # returns 'a'; an ASCII character encodes to one byte
# UTF-8 bit layout, by sequence length.
# Notation used below: dots separate groups of four bits, and parentheses mark
# the fixed UTF-8 prefix bits; the remaining bits carry the code point.
# 1 byte: 7 bits, 2 bytes: 11 bits, 3 bytes: 16 bits, 4 bytes: 21 bits.
unichr(97) # u'\x61' = u'\u0061', (0)110.0001, 7 bits for code point
# Bytes where the most significant bit is 0 never appear
# in a multi-byte sequence; each one is a complete ASCII character.
# Bytes 0xc0=1100.0000 and 0xc1=1100.0001 must never appear in a valid UTF-8
# sequence, because they could be used only for a 2-byte encoding of a 7-bit
# ASCII character which should be encoded in 1 byte (overlong sequences).
# Bytes 0xf5 to 0xff are also invalid: 0xf5-0xf7 would start sequences for
# code points above U+10FFFF, and 0xf8-0xff are not defined as lead bytes.
unichr(257) # u'\u0101', four-digit Unicode escape
unichr(257).encode('utf-8') # '\xc4\x81', 11 bits for code point
# (110)0.0100.(10)00.0001, continuation bytes start with 10
unichr(1024) # u'\u0400'
unichr(1024).encode('utf-8') # '\xd0\x80'
# (110)1.0000.(10)00.0000
unichr(4096) # u'\u1000', binary: a 1 followed by 12 zeros
unichr(4096).encode('utf-8') # '\xe1\x80\x80', 16 bits for code point
# (1110).0001.(10)00.0000.(10)00.0000
unichr(8364) # u'\u20ac', the Euro sign
unichr(8364).encode('utf-8') # '\xe2\x82\xac'
# (1110).0010.(10)00.0010.(10)10.1100
unichr(4096*8) # u'\u8000', binary: a 1 followed by 15 zeros
unichr(4096*8).encode('utf-8') # '\xe8\x80\x80'
# (1110).1000.(10)00.0000.(10)00.0000
# NOTE: unichr() above 0xFFFF needs a wide (UCS-4) Python 2 build;
# a narrow (UCS-2) build raises ValueError for the calls below.
unichr(4096*16) # u'\U00010000', binary: a 1 followed by 16 zeros
unichr(4096*16).encode('utf-8') # '\xf0\x90\x80\x80', 21 bits for code point
# (1111.0)000.(10)01.0000.(10)00.0000.(10)00.0000
unichr(1114111) # u'\U0010ffff', the highest Unicode code point
unichr(1114111).encode('utf-8') # '\xf4\x8f\xbf\xbf'
# (1111.0)100.(10)00.1111.(10)11.1111.(10)11.1111
# unicodedata: look up properties of a single code point.
import unicodedata # a database of information about code points
c = unichr(0x0bf2)
unicodedata.category(c) # 'No' (general category: Number, other)
unicodedata.name(c) # 'TAMIL NUMBER ONE THOUSAND'
unicodedata.numeric(c) # 1000.0, numeric value of character
# Bytearray objects are created with the built-in function bytearray().
# Unlike str, a bytearray can be modified in place.
bytearray() # bytearray(b''), empty
data = bytearray('abcdef') # bytearray(b'abcdef'), from str
data[1] = 'x' # bytearray(b'axcdef'), mutable! One byte is replaced in place
bytearray(6) # bytearray(b'\x00\x00\x00\x00\x00\x00'), six null bytes
bytearray((97,100,105)) # bytearray(b'adi'), from integers (byte values)
bytearray(u'abc', 'utf-8') # bytearray(b'abc'), from unicode with encoding
# The bytearray built from unicode holds the same bytes that encode() produces.
u"żółw".encode('utf-8') # '\xc5\xbc\xc3\xb3\xc5\x82w', 8-bit
bytearray(u'żółw', 'utf-8') # bytearray(b'\xc5\xbc\xc3\xb3\xc5\x82w')
# Converting a bytearray to a hexadecimal string.
# Source: https://www.geeksforgeeks.org/python-convert-bytearray-to-hexadecimal-string/?ref=rp
nlist = [124, 67, 45, 11]

# Method #1: format() + join() -- render every value as two lowercase hex
# digits, then concatenate the pieces.
"".join(map("{:02x}".format, nlist))  # '7c432d0b'

# Method #2: binascii.hexlify() -- hex-encode the packed bytes in one call.
import binascii
bytearray(nlist)  # bytearray(b'|C-\x0b')
binascii.hexlify(bytearray(nlist))  # '7c432d0b'