https://docs.python.org/3/library/stdtypes.html#text-sequence-type-str
https://en.wikipedia.org/wiki/Alt_code
Textual data in Python is handled with strings.
String modules: re, string.
+-------------------------+---------------------------+
| Operation | Meaning |
+-------------------------+---------------------------+
| "", '', str() | empty strings |
| S = "abc" | creation |
| S = str(obj) | creation using __str__() |
| S = str(b'...','utf-8') | creation from bytes |
| len(S) | length |
| S1 + S2 | concatenation |
| n * S, S * n | repetition |
| S[i] | get char at index |
| S[i:j] | slice of S (new string) |
| S[i:j:k] | slice of S with step k (new string) |
| S2 = S1[:] | copy |
| S2 = str(S1) | copy |
| for char in S: pass | iteration |
| S1 in S2 | substring (bool) |
| S1 not in S2 | negated substring test (bool) |
| S1.count(S2) | number of occurrences |
| S1.index(S2) | index of first occurrence (ValueError if not found) |
| max(S), min(S) | the largest/smallest char |
| "%s house" % "old" | formatting (old style) |
| "%s %s" % ("a", "b") | formatting (old style) |
| S.join(iterable) | joining |
| S.format(...) | formatting (Py2.6+, Py3) |
| "{} {}".format(a, b) | formatting |
| f"{a} {b}" | f-string (Py3.6+) |
| del S | remove the name S |
+-------------------------+---------------------------+
In Python 3, the 'str' type contains Unicode characters (code points). The default encoding for Python source code is UTF-8.
There is no separate “character” type. For a non-empty string S, we have S[0] == S[0:1].
# An ordinary string is Unicode.
# A name 'unichr' is not defined.
# A name 'unicode' is not defined.
"one", 'two', "three 'x' plus", 'four "y" minus'
'three \'x\' plus', "four \"y\" minus"
# whitespace is ' ', '\t', '\n'
"first" (whitespace) 'second' # will be merged to "firstsecond"
'''three single quotes'''
"""three double quotes"""
assert u'qwerty' == 'qwerty' # prefix 'u' is ignored
"qwerty", '\u0394', "\U00000394"
# Raw strings (disable most escape sequence processing).
r'ab\ncd\tef' # 'ab\\ncd\\tef'
# Converting to bytes and from bytes.
word = chr(40960) + 'abcd' + chr(1972) # '\ua000abcd\u07b4' string
[ord(c) for c in word] # [40960, 97, 98, 99, 100, 1972] list of code points (int)
word.encode('utf-8') # b'\xea\x80\x80abcd\xde\xb4' bytes
z = "żółw".encode('utf-8') # b'\xc5\xbc\xc3\xb3\xc5\x82w' bytes
z.decode('utf-8') # 'żółw' (turtle), string
str(z, 'utf-8') # 'żółw', from bytes
# A printable representation of a string, where the non-ASCII characters
# are escaped by using '\x', '\u' or '\U'.
word = 'żółw'
ascii(word) # "'\\u017c\\xf3\\u0142w'", string for print()
print(ascii(word)) # '\u017c\xf3\u0142w', code points
# Numeric entry of Unicode characters is possible in most Unix
# or Unix-like OSs by typing:
# [Ctrl]+[Shift]+[U], the hex number, the space bar or enter key.
# For the no entry sign ⛔ ('\u26d4'):
# [Ctrl]+[Shift]+[U], [2], [6], [D], [4], [Enter].
# S.join(iterable), S is a separator
S.join([S1, S2, S3]) # S1 + S + S2 + S + S3
"-".join("abc") # 'a-b-c'
"-".join("a") # 'a'
# help(str)
S.find(S1) # find the first occurrence of S1, return -1 if S1 is not found
S.replace(S1, S2) # replace S1 (all occurrences) with S2
word = "hej"
word.ljust(6) # "hej   "
word.rjust(6) # "   hej"
word.center(6) # " hej  "
# Remove whitespace.
word = " abc "
word.lstrip() # "abc "
word.rstrip() # " abc"
word.strip() # "abc"
word = "...abc.."
word.strip(".") # "abc"
word = "rAz dWa tRzY"
word.lower() # "raz dwa trzy"
word.casefold() # "raz dwa trzy", more aggressive than lower() (Py3.3+)
word.upper() # "RAZ DWA TRZY"
word.capitalize() # "Raz dwa trzy"
word.title() # "Raz Dwa Trzy"
word.swapcase() # "RaZ DwA TrZy"
word.startswith("abc") # check prefix
word.endswith("abc") # check suffix
line = "a\tb c\nd"
line.split() # ['a', 'b', 'c', 'd'], at whitespace
line.split("\t") # ["a", "b c\nd"]
'1,2,3'.split(',') # ['1', '2', '3']
line.splitlines() # ['a\tb c', 'd']
"111".zfill(8) # "00000111"
# Checking character types.
S.isalnum()
S.isalpha()
S.isascii() # Py3.7+
S.isdigit()
S.islower()
S.isupper()
S.isspace() # " " or "\n" or "\t"
# 'bytes' objects behave like immutable sequences of integers from [0, 255].
# bytes() is an immutable version of bytearray().
b"one", b'two', b"three'x", b'four"y'
b'''3 single quotes''', b"""3 double quotes"""
bytes(5) # return b'\x00\x00\x00\x00\x00'
bytes(range(4)) # return b'\x00\x01\x02\x03'
bytes(obj) # copying existing binary data via the buffer protocol
bytes.fromhex('2Ef0 F1f2 ') # b'.\xf0\xf1\xf2', classmethod
# ASCII whitespace is skipped. b'\x2e' is b'.'
for x in b'.\xf0\xf1\xf2':
    print(x) # numbers 46, 240, 241, 242
list(b'abc') # [97, 98, 99]
# 'bytearray' objects are a mutable counterpart to 'bytes' objects.
bytearray() # bytearray(b''), empty
bytearray(b'qwerty') # from bytes
bytearray((97, 100, 105)) # bytearray(b'adi'), from integers
tab = bytearray(5) # bytearray(b'\x00\x00\x00\x00\x00'), null bytes
tab[1] = 12 # bytearray(b'\x00\x0c\x00\x00\x00')
bytearray('abcd', 'utf-8') # bytearray(b'abcd'), from string with encoding
list(bytearray(b'abcd')) # [97, 98, 99, 100]