Просто мне нужно было прочесть спецификацию XML.

Чего я не сделал.

Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]
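По неведомой причине в документе каждый смайлик начинается с символа #x1, а он входит в диапазон RestrictedChar ([#x1-#x8]), поэтому парсер отказывается читать такой документ.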

В итоге решил проблему вот так:
def sanitize_xml(filename_in: AnyStr,
                 need_delete_source_file: bool = True) -> None:
    """Remove forbidden control characters from a UTF-16-LE XML file.

    The file is rewritten line by line into a temporary file inside
    ``TMP_DIR`` with every character in the range U+0000..U+0008 removed.
    The temporary file is then moved to ``filename_in``.

    Args:
        filename_in: Path of the XML file to clean. It must exist and must
            be located directly in ``TMP_DIR`` (both are checked with
            ``assert``).
        need_delete_source_file: When true, the original file is deleted
            before the cleaned copy takes its place. When false, the
            original is kept next to it, renamed to
            ``src-<random>.<original name>``.

    Raises:
        AssertionError: If ``filename_in`` does not exist, or if its
            directory differs from the directory of the temporary file.
    """
    assert os.path.exists(filename_in)

    # The range separator must be an ASCII hyphen. With a typographic dash
    # the class only matches its three literal characters, so U+0001..U+0007
    # are left in the file and real dashes in the text get deleted.
    need_delete = [re.compile(r'[\u0000-\u0008]')]

    # Pick a name in TMP_DIR that is not used by an existing file.
    while True:
        tmp_file = os.path.join(
            TMP_DIR, '{f}.xml'.format(f=gen_random_str()))
        if not os.path.exists(tmp_file):
            break

    with open(tmp_file, 'w', encoding='utf-16-le') as f_out:
        with open(filename_in, 'r', encoding='utf-16-le') as f_in:
            # Stream line by line so the whole file is never held in memory.
            for line in f_in:
                for pattern in need_delete:
                    line = pattern.sub('', line)
                f_out.write(line)

    base_path, name = os.path.split(filename_in)
    # The final os.rename() is only attempted when both files live in the
    # same directory.
    assert base_path == os.path.split(tmp_file)[0], 'WTF?'
    # NOTE(review): looks like leftover debug output; kept so that the
    # function's observable behaviour does not change.
    print(base_path)

    if need_delete_source_file:
        os.remove(filename_in)
    else:
        # Keep the original under a unique "src-<random>.<name>" name.
        backup_name = '{new}-{rnd}.{old}'.format(
            old=name, new='src', rnd=gen_random_str(),
        )
        os.rename(filename_in, os.path.join(base_path, backup_name))

    os.rename(tmp_file, filename_in)