Code search
Code search¶
We index our own openai-python code repository, and show how it can be searched. We implement a simple version of file parsing and extraction of functions from Python files.
In [1]:
Copied!
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Extract the function name from a code block beginning with "def ".

    Raises AssertionError if `code` does not start with "def ".
    """
    assert code.startswith("def ")
    # Strip surrounding whitespace so "def foo (x):" yields "foo", not "foo ".
    return code[len("def "): code.index("(")].strip()
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the lines of the definition that starts at index `i`.

    Starting from `all_lines[i]`, subsequent lines are appended while they
    are empty or begin with a space, tab, or ")" (i.e. still belong to the
    definition body). Scanning stops at the first line that starts a new
    top-level statement, or at end of input.

    Returns the collected lines joined with newlines.
    """
    ret = [all_lines[i]]
    # Iterate to the end of the file instead of the original arbitrary
    # 10000-line cap (which both truncated very long definitions and kept
    # looping uselessly past end-of-file without breaking).
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)
def get_functions(filepath):
    """
    Yield all top-level function definitions found in a Python file.

    Each yielded dict has keys:
      - "code": the full text of the function definition
      - "function_name": the name parsed from the "def" line
      - "filepath": the path the file was read from
    """
    # Use a context manager so the file handle is closed promptly
    # (the original relied on the garbage collector to close it).
    with open(filepath) as f:
        whole_code = f.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, l in enumerate(all_lines):
        # Only top-level defs match: indented (method) defs don't start at column 0.
        if l.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# The openai-python repo is expected to live directly under the user's home
# directory; expand "~" to find it.
root_dir = os.path.expanduser("~")

# Path to the code repository directory.
code_root = root_dir + "/openai-python"

# Walk the repository tree and collect every .py file.
code_files = []
for dirpath, _dirnames, _filenames in os.walk(code_root):
    code_files.extend(glob(os.path.join(dirpath, '*.py')))
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# Parse each file and accumulate every extracted function record.
all_funcs = []
for code_file in code_files:
    for func in get_functions(code_file):
        all_funcs.append(func)
print("Total number of functions extracted:", len(all_funcs))
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Return the name of the function defined at the start of `code`.

    `code` must begin with "def "; everything between that prefix and the
    first "(" is taken as the function name.
    """
    prefix = "def "
    assert code.startswith(prefix)
    return code[len(prefix): code.index("(")]
def get_until_no_space(all_lines, i) -> str:
    """
    Gather the lines of the definition that starts at index `i`.

    Lines are accumulated while they are empty or begin with a space, tab,
    or ")"; scanning stops at the first other line (or after 9999 lines).
    """
    collected = [all_lines[i]]
    j = i + 1
    # Same bounded scan as the original: up to 9999 lines past the def line.
    while j < i + 10000:
        if j < len(all_lines):
            current = all_lines[j]
            if len(current) == 0 or current[0] in [" ", "\t", ")"]:
                collected.append(current)
            else:
                break
        j += 1
    return "\n".join(collected)
def get_functions(filepath):
    """
    Get all functions in a Python file.

    Yields one dict per top-level "def" line, with keys "code" (the full
    definition text), "function_name", and "filepath".
    """
    # NOTE(review): the file handle is never closed explicitly; CPython's
    # refcounting closes it, but a `with open(...)` would be safer.
    whole_code = open(filepath).read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, l in enumerate(all_lines):
        # Only column-0 defs match, so methods inside classes are skipped.
        if l.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# get user root directory
root_dir = os.path.expanduser("~")
# note: for this code to work, the openai-python repo must be downloaded and placed in your root directory
# path to code repository directory
code_root = root_dir + "/openai-python"
# recursively collect every .py file under the repository
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], '*.py'))]
print("Total number of py files:", len(code_files))
if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")
# parse each file and accumulate every extracted function record
all_funcs = []
for code_file in code_files:
    funcs = list(get_functions(code_file))
    for func in funcs:
        all_funcs.append(func)
print("Total number of functions extracted:", len(all_funcs))
Total number of py files: 51 Total number of functions extracted: 97
In [2]:
Copied!
from openai.embeddings_utils import get_embedding

df = pd.DataFrame(all_funcs)
# Embed each function body; this issues one API call per function.
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Store repo-relative paths so the CSV is portable across machines.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# Ensure the output directory exists before writing — the original assumed
# "data/" was already present and would raise OSError otherwise.
os.makedirs("data", exist_ok=True)
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
from openai.embeddings_utils import get_embedding

df = pd.DataFrame(all_funcs)
# Embed each function body; one API call per function.
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Store repo-relative paths so the CSV is portable across machines.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# NOTE(review): assumes a "data/" directory already exists in the cwd.
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
Out[2]:
| code | function_name | filepath | code_embedding | |
|---|---|---|---|---|
| 0 | def _console_log_level():\n if openai.log i... | _console_log_level | /openai/util.py | [0.03389773145318031, -0.004390408284962177, 0... |
| 1 | def log_debug(message, **params):\n msg = l... | log_debug | /openai/util.py | [-0.004034275189042091, 0.004895383026450872, ... |
| 2 | def log_info(message, **params):\n msg = lo... | log_info | /openai/util.py | [0.004882764536887407, 0.0033515947870910168, ... |
| 3 | def log_warn(message, **params):\n msg = lo... | log_warn | /openai/util.py | [0.002535992069169879, -0.010829543694853783, ... |
| 4 | def logfmt(props):\n def fmt(key, val):\n ... | logfmt | /openai/util.py | [0.016732551157474518, 0.017367802560329437, 0... |
In [3]:
Copied!
from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the functions in `df` by embedding similarity to `code_query`.

    Embeds the query, scores every row's `code_embedding` against it, and
    returns the top `n` rows. When `pprint` is true, also prints each hit's
    path, name, score, and the first `n_lines` lines of its code.

    Note: writes a 'similarities' column onto the caller's `df` in place.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            print(row.filepath + ":" + row.function_name + " score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-' * 70)
    return res

res = search_functions(df, 'Completions API tests', n=3)
from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the functions in `df` by embedding similarity to `code_query` and
    return the top `n` rows; when `pprint` is true, also print each hit's
    path, name, score, and first `n_lines` lines of code.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    # NOTE: adds a 'similarities' column to the caller's df in place.
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for r in res.iterrows():
            # r is an (index, Series) tuple; r[1] is the row itself.
            print(r[1].filepath+":"+r[1].function_name + " score=" + str(round(r[1].similarities, 3)))
            print("\n".join(r[1].code.split("\n")[:n_lines]))
            print('-'*70)
    return res

# Example query: look for Completions API test functions.
res = search_functions(df, 'Completions API tests', n=3)
/openai/tests/test_endpoints.py:test_completions score=0.826
def test_completions():
result = openai.Completion.create(prompt="This was a test", n=5, engine="ada")
assert len(result.choices) == 5
----------------------------------------------------------------------
/openai/tests/test_endpoints.py:test_completions_model score=0.811
def test_completions_model():
result = openai.Completion.create(prompt="This was a test", n=5, model="ada")
assert len(result.choices) == 5
assert result.model.startswith("ada")
----------------------------------------------------------------------
/openai/tests/test_endpoints.py:test_completions_multiple_prompts score=0.808
def test_completions_multiple_prompts():
result = openai.Completion.create(
prompt=["This was a test", "This was another test"], n=5, engine="ada"
)
assert len(result.choices) == 10
----------------------------------------------------------------------
In [4]:
Copied!
# Query for the fine-tuning data-validation helpers in the repo.
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
/openai/validators.py:format_inferrer_validator score=0.751
def format_inferrer_validator(df):
"""
This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
It will also suggest to use ada and explain train/validation split benefits.
"""
ft_type = infer_task_type(df)
immediate_msg = None
----------------------------------------------------------------------
/openai/validators.py:get_validators score=0.748
def get_validators():
return [
num_examples_validator,
lambda x: necessary_column_validator(x, "prompt"),
lambda x: necessary_column_validator(x, "completion"),
additional_column_validator,
non_empty_field_validator,
----------------------------------------------------------------------
/openai/validators.py:infer_task_type score=0.738
def infer_task_type(df):
"""
Infer the likely fine-tuning task type from the data
"""
CLASSIFICATION_THRESHOLD = 3 # min_average instances of each class
if sum(df.prompt.str.len()) == 0:
return "open-ended generation"
----------------------------------------------------------------------
In [5]:
Copied!
# Query for suffix-finding utilities; show 10 lines per hit for more context.
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
/openai/validators.py:get_common_xfix score=0.793
def get_common_xfix(series, xfix="suffix"):
"""
Finds the longest common suffix or prefix of all the values in a series
"""
common_xfix = ""
while True:
common_xfixes = (
series.str[-(len(common_xfix) + 1) :]
if xfix == "suffix"
else series.str[: len(common_xfix) + 1]
----------------------------------------------------------------------
/openai/validators.py:common_completion_suffix_validator score=0.778
def common_completion_suffix_validator(df):
"""
This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
"""
error_msg = None
immediate_msg = None
optional_msg = None
optional_fn = None
ft_type = infer_task_type(df)
----------------------------------------------------------------------
In [6]:
Copied!
# Query for the CLI entry points; a single hit with a longer code preview.
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
/openai/cli.py:tools_register score=0.773
def tools_register(parser):
subparsers = parser.add_subparsers(
title="Tools", help="Convenience client side tools"
)
def help(args):
parser.print_help()
parser.set_defaults(func=help)
sub = subparsers.add_parser("fine_tunes.prepare_data")
sub.add_argument(
"-f",
"--file",
required=True,
help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed."
"This should be the local file path.",
)
sub.add_argument(
"-q",
----------------------------------------------------------------------