https://docs.python.org/3/library/stdtypes.html#text-sequence-type-str
https://en.wikipedia.org/wiki/Alt_code
Textual data in Python is handled with strings.
String modules: re, string.
+-------------------------+---------------------------+
| Operation               | Meaning                   |
+-------------------------+---------------------------+
| "", '', str()           | empty strings             |
| S = "abc"               | creation                  |
| S = str(obj)            | creation using __str__()  |
| S = str(b'...','utf-8') | creation from bytes       |
| len(S)                  | length                    |
| S1 + S2                 | concatenation             |
| n * S, S * n            | repetition                |
| S[i]                    | get char at index         |
| S[i:j]                  | slice of S (new string)   |
| S[i:j:k]                | slice of S (new string)   |
| S2 = S1[:]              | copy                      |
| S2 = str(S1)            | copy                      |
| for char in S: pass     | iteration                 |
| S1 in S2                | substring (bool)          |
| S1 not in S2            | substring (bool)          |
| S1.count(S2)            | number of occurrences     |
| S1.index(S2)            | first occurrence          |
| max(S), min(S)          | the largest|smallest char |
| "%s house" % "old"      | formatting (old style)    |
| "%s %s" % ("a", "b")    | formatting (old style)    |
| S.join(iterable)        | joining                   |
| S.format(...)           | formatting (Py2.6+, Py3)  |
| "{} {}".format(a, b)    | formatting                |
| f"{a} {b}"              | f-string (Py3.6+)         |
| del S                   | remove the name S         |
+-------------------------+---------------------------+
In Python 3, the 'str' type contains Unicode characters (code points). The default encoding for Python source code is UTF-8.
There is no separate “character” type. For a non-empty string S, we have S[0] == S[0:1].
# Usual string is unicode. # A name 'unichr' is not defined. # A name 'unicode' is not defined. "one", 'two', "three 'x' plus", 'four "y" minus' 'three \'x\' plus', "four \"y\" minus" # whitespace is ' ', '\t', '\n' "first" (whitespace) 'second' # will be merged to "firstsecond" '''three single quotes''' """three double quotes""" assert u'qwerty' == 'qwerty' # prefix 'u' is ignored "qwerty", '\u0394', "\U00000394"
# Raw strings (disable most escape sequence processing).
r'ab\ncd\tef'  # 'ab\\ncd\\tef'
# Converting to bytes and from bytes. word = chr(40960) + 'abcd' + chr(1972) # '\ua000abcd\u07b4' string [ord(c) for c in word] # [40960, 97, 98, 99, 100, 1972] list of code points (int) word.encode('utf-8') # b'\xea\x80\x80abcd\xde\xb4' bytes z = "żółw".encode('utf-8') # b'\xc5\xbc\xc3\xb3\xc5\x82w' bytes z.decode('utf-8') # 'żółw' (turtle), string str(z, 'utf-8') # 'żółw', from bytes
# A printable representation of a string, where the non-ASCII characters # are escaped by using '\x', '\u' or '\U'. word = 'żółw' ascii(word) # "'\\u017c\\xf3\\u0142w'", string for print() print(ascii(word)) # '\u017c\xf3\u0142w', code points # Numeric entry of Unicode characters is possible in most Unix # or Unix-like OSs by typing: # [Ctrl]+[Shift]+[U], the hex number, the space bar or enter key. # For the no entry sign ⛔ ('\u26d4'): # [Ctrl]+[Shift]+[U], [2], [6], [D], [4], [Enter].
# S.join(iterable), S is a separator S.join([S1, S2, S3]) # S1 + S + S2 + S + S3 "-".join("abc") # 'a-b-c' "-".join("a") # 'a'
# help(str)
S.find(S1)  # find the first occurrence of S1, return -1 if S1 is not found
S.replace(S1, S2)  # replace S1 (all occurrences) with S2
word = "hej"
word.ljust(6)  # "hej   "
word.rjust(6)  # "   hej"
word.center(6)  # " hej  "
# Remove whitespace.
word = " abc "
word.lstrip()  # "abc "
word.rstrip()  # " abc"
word.strip()  # "abc"
word = "...abc.."
word.strip(".")  # "abc"
word = "rAz dWa tRzY"
word.lower()  # "raz dwa trzy"
word.casefold()  # "raz dwa trzy", more aggressive than lower(), Py3.3+
word.upper()  # "RAZ DWA TRZY"
word.capitalize()  # "Raz dwa trzy"
word.title()  # "Raz Dwa Trzy"
word.swapcase()  # "RaZ DwA TrZy"
word.startswith("abc")  # check prefix
word.endswith("abc")  # check suffix
line = "a\tb c\nd"
line.split()  # ['a', 'b', 'c', 'd'], at whitespace
line.split("\t")  # ["a", "b c\nd"]
'1,2,3'.split(',')  # ['1', '2', '3']
line.splitlines()  # ['a\tb c', 'd']
"111".zfill(8)  # "00000111"
# Checking character types.
S.isalnum()
S.isalpha()
S.isascii()  # Py3.7
S.isdigit()
S.islower()
S.isupper()
S.isspace()  # " " or "\n" or "\t"
# 'bytes' objects behave like immutable sequences of integers from [0, 255]. # bytes() is an immutable version of bytearray(). b"one", b'two', b"three'x", b'four"y' b'''3 single quotes''', b"""3 double quotes""" bytes(5) # return b'\x00\x00\x00\x00\x00' bytes(range(4)) # return b'\x00\x01\x02\x03' bytes(obj) # copying existing binary data via the buffer protocol bytes.fromhex('2Ef0 F1f2 ') # b'.\xf0\xf1\xf2', classmethod # ASCII whitespaces are skipped. b'\x2e' is b'.' for x in b'.\xf0\xf1\xf2': print(x) # numbers 46, 240, 241, 242 list(b'abc') # [97, 98, 99]
# 'bytearray' objects are a mutable counterpart to 'bytes' object. bytearray() # bytearray(b''), empty bytearray(b'qwerty') # from bytes bytearray((97, 100, 105)) # bytearray(b'adi'), from integers tab = bytearray(5) # bytearray(b'\x00\x00\x00\x00\x00'), null bytes tab[1] = 12 # bytearray(b'\x00\x0c\x00\x00\x00') bytearray('abcd', 'utf-8') # bytearray(b'abcd'), from string with encoding list(bytearray(b'abcd')) # [97, 98, 99, 100]