u'\0' (NUL)
u'\t' u'\n' (Control characters)
< > & ' ",
é æ €
' String literal delimiter " ` [ ] Identifier delimiter /* */ // -- %
({ }) Comment delimiter
More...
# WARNING: the username is concatenated directly into the SQL text. A value
# containing a single quote ends the string literal early and the rest of the
# value is parsed as SQL (SQL injection). Pass the value as a bound parameter
# to cursor.execute() instead of building the statement by hand.
cursor.execute(
u"SELECT name, ... FROM users "
u"WHERE username = '" + username + u"'")
# WARNING: the % operator here is plain Python string formatting, applied
# before cursor.execute() is called. The username is spliced into the SQL text
# unescaped, so a single quote in the value breaks out of the literal exactly
# as string concatenation would (SQL injection).
cursor.execute(
u"SELECT name, ... FROM users "
u"WHERE username = '%s'"
% (username,))
def escape(s):
    """Return *s* with single quotes escaped for use in a SQL string literal.

    Each ``'`` is doubled (``''``), which is the standard SQL way of writing
    a quote inside a single-quoted literal. The previous implementation
    replaced the quote with a lone backslash, which dropped the quote from
    the data and could leave a trailing backslash that swallows the closing
    quote of the literal.

    This only handles the quote character. Databases that also treat
    backslash as an escape character need more than this, so bound
    parameters remain the preferred way to pass values to a query.
    """
    return s.replace(u"'", u"''")
# WARNING: the statement is still assembled with Python string formatting;
# the only protection is the hand-written escape() helper. The query is safe
# only if escape() handles every character that is special to the target
# database, so passing the value as a bound parameter is more robust.
cursor.execute(
u"SELECT name, ... FROM users "
u"WHERE username = '%s'"
% (escape(username),))
# Parameterized query: the SQL text (with a "?" placeholder) and the value
# are handed to execute() as separate arguments, so the username is never
# spliced into the statement.
_query = (u"SELECT name, ... FROM users "
          u"WHERE username = ?")
_params = (username,)
cursor.execute(_query, _params)
import psycopg2
import psycopg2.extensions
# Register psycopg2's UNICODE and UNICODEARRAY type casters before connecting.
for _caster in (psycopg2.extensions.UNICODE,
                psycopg2.extensions.UNICODEARRAY):
    psycopg2.extensions.register_type(_caster)

conn = psycopg2.connect(...)
cursor = conn.cursor()

# The value travels as a separate argument; "%s" is the driver's placeholder,
# not Python string formatting (no % operator is applied to the SQL text).
_query = (u"SELECT name, ... FROM users "
          u"WHERE username = %s")
cursor.execute(_query, (username,))
, \t | : Field delimiter
\n \n\f \f Row delimiter " ' Field quote Space
data = [
    [42.0, u'Tartan paint', u"Björk's DIY", u'€19.95'],
    [1, u'Hammer, left handed', u'"Crazy" Eddie', u'¥2000'],
]

# Naive CSV export: every cell is converted to text, encoded as UTF-8 and the
# cells of a row are joined with commas.
# NOTE: nothing is quoted or escaped, so cells that themselves contain a comma
# or a double quote (e.g. u'Hammer, left handed', u'"Crazy" Eddie') produce
# rows a CSV reader cannot split back into the original fields.
with open('output.csv', 'w') as csv_out:
    for row in data:
        # Convert inside the generator: the rows mix floats, ints and unicode
        # strings, and only text objects have .encode().
        line = ','.join(unicode(item).encode('utf-8') for item in row)
        csv_out.write(line + '\n')
# -*- coding: utf-8 -*-
import csv
import codecs
import cStringIO


class UTF8Recoder:
    """Iterator that reads an encoded stream and reencodes the input to UTF-8"""
    ...  # body elided in this material


class UnicodeWriter:
    """A CSV writer which will write rows to CSV file "f", which is encoded in the given encoding."""
    ...  # body elided in this material


data = [
    [42.0, u'Tartan paint', u"Björk's DIY", u'€19.95'],
    [1, u'Hammer, left handed', u'"Crazy" Eddie', u'¥2000'],
]

with codecs.open('wildgeese.csv', 'wb') as csv_out:
    # Write a UTF-8 byte order mark before any rows.
    csv_out.write(codecs.BOM_UTF8)
    writer = UnicodeWriter(csv_out)
    writer.writerows(data)
Give up on TSV and CSV, use .xls with xlwt
# -*- coding: utf-8 -*-
import xlwt

data = [
    [42.0, u'Tartan paint', u"Björk's DIY", u'€19.95'],
    [1, u'Hammer, left handed', u'"Crazy" Eddie', u'¥2000'],
]

# Write each cell individually; the row and column indexes come straight
# from the position of the item in the nested list.
workbook = xlwt.Workbook(encoding='utf-8')
sheet = workbook.add_sheet(u'Sheet 1¼')
for i, row in enumerate(data):
    for j, item in enumerate(row):
        sheet.write(i, j, item)
workbook.save('wildgeese.xls')
{ } Object delimiter
[ ] Array delimiter
" String delimiter
# Sample record mixing non-ASCII text, an int, a bool and a tuple.
data = {
    u'text': u'ℙƴ☂ℌøἤ',
    u'number': 42,
    u'bool': True,
    u'sequence': (1, 2, 3, 4),
}
# Python's own representation of the record (result not stored).
repr(data)
# Same sample record, serialized with json.dumps() and its default options.
data = {
    u'text': u'ℙƴ☂ℌøἤ',
    u'number': 42,
    u'bool': True,
    u'sequence': (1, 2, 3, 4),
}
json.dumps(data)
# Same sample record, serialized with ensure_ascii switched off.
data = {
    u'text': u'ℙƴ☂ℌøἤ',
    u'number': 42,
    u'bool': True,
    u'sequence': (1, 2, 3, 4),
}
json.dumps(data, ensure_ascii=False)
& < > ' " Syntactic elements
` (parsed by Internet Explorer)
# WARNING: the values are interpolated into the markup without any escaping.
# The double quotes inside 'name' end the title="..." attribute early, and
# the '<&>' inside 'role' is emitted as raw markup.
data = {u'name': u'Alice "Ali" Gator', u'role': u'CEO, <&> design Inc.'}
u'<a href="..." title="%(name)s">%(role)s</a>' % data
# NOTE: cgi.escape() is called without quote=True. Per its documentation it
# then converts only '&', '<' and '>', so the double quotes inside 'name' are
# left as-is and still terminate the title="..." attribute early.
data = {u'name':
cgi.escape(u'Alice "Ali" Gator'), u'role':
cgi.escape(u'CEO, <&> design Inc.')}
u'<a href="..." title="%(name)s">%(role)s</a>' % data
# quote=True makes cgi.escape() also convert the double-quote character, which
# is what a value placed inside a double-quoted attribute needs.
# NOTE: per the cgi.escape() documentation the apostrophe is not converted;
# that is harmless here only because the template delimits the attribute
# with double quotes.
data = {u'name':
cgi.escape(u'Bob "Croc" O\'Doyle',
quote=True), u'role':
cgi.escape(u'CEO, <&> design Inc.')}
u'<a href="..." title="%(name)s">%(role)s</a>' % data
from xml.sax.saxutils import quoteattr
# NOTE(review): per the xml.sax.saxutils documentation, quoteattr() returns
# the value already wrapped in quote characters, while the template below
# also wraps %(name)s in double quotes -- the title attribute appears to end
# up quoted twice. Confirm whether that is the intended demonstration.
data = {
u'name':
quoteattr(u'Bob "Croc" O\'Doyle'), u'role':
cgi.escape(u'CEO, <&> design Inc.')}
u'<a href="..." title="%(name)s">%(role)s</a>' % data
import jinja2

data = {u'name': u'Bob "Croc" O\'Doyle',
        u'role': u'CEO, <&> design Inc.'}

# The environment is created with autoescape=True and the raw, unescaped
# values are passed straight to render().
env = jinja2.Environment(autoescape=True)
template = env.from_string(
    u'<a href="..." title="{{name}}">'
    u'{{role}}</a>')
template.render(data)
import genshi.template
data = {u'name': u'Bob "Croc" O\'Doyle',
        u'role': u'CEO, <&> design Inc.'}

# The raw, unescaped values are passed to the template as keyword arguments
# and the resulting stream is rendered with the 'html' method.
template = genshi.template.MarkupTemplate(
    u'<a href="..." title="${name}">'
    u'${role}</a>')
stream = template.generate(**data)
stream.render('html')
<![CDATA[ ]]> is not a useful escape mechanism,
it's just a convenience for hand written markup
Control characters (other than tab, line feed and carriage return) are not valid,
even when escaped as character references
http://www.w3.org/TR/REC-xml/#charsets
All Unicode characters except NUL are valid; some restricted characters must be escaped
http://www.w3.org/TR/xml11/#charsets
Nearly everything, depending on the section of the URL and the scheme
needle = u'needle+thread@example.com'
haystack = (u'hay@straw.com\n'
            u'needle+thread@example.com\n'
            u'christian@bale.com')
# NOTE: needle is interpolated into the pattern unescaped, so '+' and '.' act
# as regex operators rather than literal characters. 'e+' requires one or
# more 'e' followed directly by 'thread', so the line containing the literal
# text 'needle+thread@example.com' is NOT matched and match is None.
match = re.search(u'^%s$' % needle,
                  haystack,
                  re.UNICODE | re.MULTILINE)
needle = u'needle+thread@example.com'
haystack = (u'hay@straw.com\n'
            u'needle+thread@example.com\n'
            u'christian@bale.com')
# re.escape() makes every character of needle match literally, so the
# anchored, multi-line search finds the line that equals needle exactly.
match = re.search(u'^%s$' % re.escape(needle),
                  haystack,
                  re.UNICODE | re.MULTILINE)