@honzakral
grep -i -r 'web.*framework'
WHERE text ILIKE '%python%'
Bible concordance,
finished 1230
{
'description': {
...
'programming': {1},
'python': {0, 1},
'quick': {0, 1},
'reinvent': {0},
...
},
'title': { ... }
}
def index_docs(docs, *fields):
    """Build an inverted index over the given fields of ``docs``.

    Args:
        docs: Iterable of documents; each document is subscripted with
            every name in ``fields`` and the value is passed to
            ``analyze``.
        *fields: Names of the document fields to index.

    Returns:
        A nested mapping ``index[field][token] -> set of document ids``,
        where a document's id is its position in ``docs``.
    """
    index = defaultdict(
        lambda: defaultdict(set))
    # ``doc_id`` rather than ``id`` so the builtin is not shadowed.
    for doc_id, doc in enumerate(docs):
        for field in fields:
            for token in analyze(doc[field]):
                index[field][token].add(doc_id)
    return index
# Any single character that is not an ASCII letter or digit separates tokens.
SPLIT_RE = re.compile(r'[^a-zA-Z0-9]')


def tokenize(text):
    """Yield the alphanumeric tokens of ``text`` in order.

    ``SPLIT_RE.split`` produces empty strings wherever separators are
    adjacent or sit at either end of the text (e.g. ``"a, b"``); those
    empty strings are skipped so they never become tokens.
    """
    for token in SPLIT_RE.split(text):
        if token:
            yield token
def lowercase(tokens):
    """Token filter: lazily yield each token converted to lower case."""
    for t in tokens:
        yield t.lower()
# Maps a token to the canonical token that replaces it.
SYNONYMS = {
    'rapid': 'quick',
}


def synonyms(tokens):
    """Token filter: replace each token by its entry in ``SYNONYMS``.

    Tokens without an entry are yielded unchanged.
    """
    for t in tokens:
        yield SYNONYMS.get(t, t)
def analyze(text):
    """Yield the tokens of ``text`` after running the analysis chain.

    The text is split by ``tokenize`` and the resulting token stream is
    passed through ``lowercase`` and then ``synonyms``, in that order.
    Lowercasing runs first, so synonym lookup sees lowercased tokens.
    """
    tokens = tokenize(text)
    for token_filter in (lowercase, synonyms):
        tokens = token_filter(tokens)
    yield from tokens
# Set operation used to merge result sets, keyed by boolean operator name.
COMBINE = dict(OR=set.union, AND=set.intersection)
def search_in_fields(index, query, fields):
    """Yield, for each analyzed query token, the ids matching it in any field.

    Args:
        index: Mapping ``index[field][token] -> set of document ids``.
        query: Query text; it is run through ``analyze``.
        fields: Iterable of field names to look the tokens up in.

    Yields:
        One new set per query token: the union of that token's document
        ids across ``fields``. A field or token missing from the index
        contributes nothing.
    """
    # Materialize once: ``fields`` is re-iterated for every token, which
    # would silently exhaust a one-shot iterator after the first token.
    fields = tuple(fields)
    no_hits = frozenset()
    for t in analyze(query):
        # ``.get`` instead of subscripting so a lookup never inserts empty
        # entries into a defaultdict-based index and never raises KeyError
        # on a plain dict. The leading ``set()`` keeps the union valid
        # even when ``fields`` is empty.
        yield COMBINE['OR'](
            set(),
            *(index.get(f, {}).get(t, no_hits) for f in fields)
        )
def search(index, query, operator='AND', fields=None):
    """Return the set of document ids matching ``query``.

    Args:
        index: Mapping ``index[field][token] -> set of document ids``.
        query: Query text, analyzed into tokens by ``search_in_fields``.
        operator: Key into ``COMBINE`` (``'AND'`` or ``'OR'``) selecting
            how the per-token result sets are merged.
        fields: Field names to search; every field of ``index`` when
            omitted or empty.

    Returns:
        The combined set of matching document ids; an empty set when the
        query produces no tokens.

    Raises:
        KeyError: If ``operator`` is not a key of ``COMBINE``.
    """
    fields = fields or index.keys()
    combine = COMBINE[operator]
    per_token = list(search_in_fields(index, query, fields))
    if not per_token:
        # ``set.union`` / ``set.intersection`` called with no arguments
        # raise TypeError; a query without tokens simply matches nothing.
        return set()
    return combine(*per_token)
dict -> list
set -> list
set union/intersect -> merge lists
py*
"monty python"
http://bit.ly/searchpy
@honzakral
http://bit.ly/searchpy