Code search
Code search¶
We index our own openai-python code repository and show how it can be searched. We implement a simple version of file parsing and extraction of functions from Python files.
In [1]:
Copied!
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Extract the function name from a code snippet whose first line begins with "def ".

    Parameters
    ----------
    code : str
        Source text starting with a ``def `` line, e.g. ``"def foo(a, b):"``.

    Returns
    -------
    str
        The identifier between ``def `` and the opening parenthesis.
    """
    assert code.startswith("def ")
    # Slice out everything between the "def " prefix and the argument list.
    name_start = len("def ")
    name_end = code.index("(")
    return code[name_start:name_end]
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source block starting at line ``i``.

    Starting from ``all_lines[i]`` (assumed to be a ``def`` line), keep taking
    subsequent lines while they are blank or start with whitespace / a closing
    parenthesis — i.e. while they still belong to the function body — and stop
    at the first line that returns to column 0.

    Parameters
    ----------
    all_lines : list[str]
        The file's source split into lines.
    i : int
        Index of the first line of the block.

    Returns
    -------
    str
        The block's lines joined with ``"\n"``.
    """
    ret = [all_lines[i]]
    # Bound the scan by the end of the file instead of the original arbitrary
    # 10000-line cap, which both truncated very long functions and wasted
    # iterations past EOF.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)
def get_functions(filepath):
    """
    Yield every top-level function defined in a Python file.

    Each yielded dict has keys ``"code"`` (the function's full source text),
    ``"function_name"``, and ``"filepath"``. Only lines that start with
    ``def `` at column 0 are considered, so methods and nested functions are
    intentionally skipped.

    Parameters
    ----------
    filepath : str
        Path to a ``.py`` file.

    Yields
    ------
    dict
        ``{"code": str, "function_name": str, "filepath": str}``.
    """
    # Close the file deterministically: the original `open(filepath).read()`
    # inside a generator leaked the handle until garbage collection.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# get user root directory
root_dir = os.path.expanduser("~")

# note: for this code to work, the openai-python repo must be downloaded and
# placed in your root directory
# path to code repository directory
code_root = os.path.join(root_dir, "openai-python")

# recursively collect every .py file under the repository root
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], "*.py"))]
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# flatten the per-file generators into one list of function records
# (replaces the original nested append loop with a comprehension)
all_funcs = [func for code_file in code_files for func in get_functions(code_file)]
print("Total number of functions extracted:", len(all_funcs))
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Extract the function name from a code snippet whose first line begins with "def ".

    Parameters
    ----------
    code : str
        Source text starting with a ``def `` line, e.g. ``"def foo(a, b):"``.

    Returns
    -------
    str
        The identifier between ``def `` and the opening parenthesis.
    """
    assert code.startswith("def ")
    # Slice out everything between the "def " prefix and the argument list.
    name_start = len("def ")
    name_end = code.index("(")
    return code[name_start:name_end]
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source block starting at line ``i``.

    Starting from ``all_lines[i]`` (assumed to be a ``def`` line), keep taking
    subsequent lines while they are blank or start with whitespace / a closing
    parenthesis — i.e. while they still belong to the function body — and stop
    at the first line that returns to column 0.

    Parameters
    ----------
    all_lines : list[str]
        The file's source split into lines.
    i : int
        Index of the first line of the block.

    Returns
    -------
    str
        The block's lines joined with ``"\n"``.
    """
    ret = [all_lines[i]]
    # Bound the scan by the end of the file instead of the original arbitrary
    # 10000-line cap, which both truncated very long functions and wasted
    # iterations past EOF.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)
def get_functions(filepath):
    """
    Yield every top-level function defined in a Python file.

    Each yielded dict has keys ``"code"`` (the function's full source text),
    ``"function_name"``, and ``"filepath"``. Only lines that start with
    ``def `` at column 0 are considered, so methods and nested functions are
    intentionally skipped.

    Parameters
    ----------
    filepath : str
        Path to a ``.py`` file.

    Yields
    ------
    dict
        ``{"code": str, "function_name": str, "filepath": str}``.
    """
    # Close the file deterministically: the original `open(filepath).read()`
    # inside a generator leaked the handle until garbage collection.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# get user root directory
root_dir = os.path.expanduser("~")

# note: for this code to work, the openai-python repo must be downloaded and
# placed in your root directory
# path to code repository directory
code_root = os.path.join(root_dir, "openai-python")

# recursively collect every .py file under the repository root
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], "*.py"))]
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# flatten the per-file generators into one list of function records
# (replaces the original nested append loop with a comprehension)
all_funcs = [func for code_file in code_files for func in get_functions(code_file)]
print("Total number of functions extracted:", len(all_funcs))
Total number of py files: 51 Total number of functions extracted: 97
In [2]:
Copied!
# NOTE(review): `openai.embeddings_utils` and the `engine=` keyword were removed
# in openai-python v1.x; this cell requires openai<1.0 — confirm the pinned version.
from openai.embeddings_utils import get_embedding
# Build a DataFrame of function records and embed each function's source code.
df = pd.DataFrame(all_funcs)
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Strip the local repo prefix so file paths are repo-relative.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# Persist embeddings for reuse; assumes a local "data/" directory exists — TODO confirm.
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
# NOTE(review): `openai.embeddings_utils` and the `engine=` keyword were removed
# in openai-python v1.x; this cell requires openai<1.0 — confirm the pinned version.
from openai.embeddings_utils import get_embedding
# Build a DataFrame of function records and embed each function's source code.
df = pd.DataFrame(all_funcs)
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Strip the local repo prefix so file paths are repo-relative.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# Persist embeddings for reuse; assumes a local "data/" directory exists — TODO confirm.
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
Out[2]:
code | function_name | filepath | code_embedding | |
---|---|---|---|---|
0 | def _console_log_level():\n if openai.log i... | _console_log_level | /openai/util.py | [0.03389773145318031, -0.004390408284962177, 0... |
1 | def log_debug(message, **params):\n msg = l... | log_debug | /openai/util.py | [-0.004034275189042091, 0.004895383026450872, ... |
2 | def log_info(message, **params):\n msg = lo... | log_info | /openai/util.py | [0.004882764536887407, 0.0033515947870910168, ... |
3 | def log_warn(message, **params):\n msg = lo... | log_warn | /openai/util.py | [0.002535992069169879, -0.010829543694853783, ... |
4 | def logfmt(props):\n def fmt(key, val):\n ... | logfmt | /openai/util.py | [0.016732551157474518, 0.017367802560329437, 0... |
In [3]:
Copied!
from openai.embeddings_utils import cosine_similarity
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the indexed functions by embedding similarity to a natural-language query.

    Embeds ``code_query``, scores every row of ``df`` by cosine similarity
    against its precomputed ``code_embedding``, and returns the top ``n`` rows.
    Note this writes a ``'similarities'`` column onto ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``code``, ``function_name``, ``filepath`` and
        ``code_embedding`` columns.
    code_query : str
        Free-form search text.
    n : int
        Number of top matches to return.
    pprint : bool
        If True, print each match's location, score, and first ``n_lines`` lines.
    n_lines : int
        Number of source lines to show per match when printing.

    Returns
    -------
    pandas.DataFrame
        The ``n`` best-matching rows, sorted by descending similarity.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            print(row.filepath + ":" + row.function_name + " score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-'*70)
    return res

res = search_functions(df, 'Completions API tests', n=3)
from openai.embeddings_utils import cosine_similarity
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the indexed functions by embedding similarity to a natural-language query.

    Embeds ``code_query``, scores every row of ``df`` by cosine similarity
    against its precomputed ``code_embedding``, and returns the top ``n`` rows.
    Note this writes a ``'similarities'`` column onto ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``code``, ``function_name``, ``filepath`` and
        ``code_embedding`` columns.
    code_query : str
        Free-form search text.
    n : int
        Number of top matches to return.
    pprint : bool
        If True, print each match's location, score, and first ``n_lines`` lines.
    n_lines : int
        Number of source lines to show per match when printing.

    Returns
    -------
    pandas.DataFrame
        The ``n`` best-matching rows, sorted by descending similarity.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            print(row.filepath + ":" + row.function_name + " score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-'*70)
    return res

res = search_functions(df, 'Completions API tests', n=3)
/openai/tests/test_endpoints.py:test_completions score=0.826 def test_completions(): result = openai.Completion.create(prompt="This was a test", n=5, engine="ada") assert len(result.choices) == 5 ---------------------------------------------------------------------- /openai/tests/test_endpoints.py:test_completions_model score=0.811 def test_completions_model(): result = openai.Completion.create(prompt="This was a test", n=5, model="ada") assert len(result.choices) == 5 assert result.model.startswith("ada") ---------------------------------------------------------------------- /openai/tests/test_endpoints.py:test_completions_multiple_prompts score=0.808 def test_completions_multiple_prompts(): result = openai.Completion.create( prompt=["This was a test", "This was another test"], n=5, engine="ada" ) assert len(result.choices) == 10 ----------------------------------------------------------------------
In [4]:
Copied!
# Example: semantic search for the fine-tuning data-validation helpers.
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
/openai/validators.py:format_inferrer_validator score=0.751 def format_inferrer_validator(df): """ This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification. It will also suggest to use ada and explain train/validation split benefits. """ ft_type = infer_task_type(df) immediate_msg = None ---------------------------------------------------------------------- /openai/validators.py:get_validators score=0.748 def get_validators(): return [ num_examples_validator, lambda x: necessary_column_validator(x, "prompt"), lambda x: necessary_column_validator(x, "completion"), additional_column_validator, non_empty_field_validator, ---------------------------------------------------------------------- /openai/validators.py:infer_task_type score=0.738 def infer_task_type(df): """ Infer the likely fine-tuning task type from the data """ CLASSIFICATION_THRESHOLD = 3 # min_average instances of each class if sum(df.prompt.str.len()) == 0: return "open-ended generation" ----------------------------------------------------------------------
In [5]:
Copied!
# Example: search for suffix/prefix utilities, showing 10 lines of each match.
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
/openai/validators.py:get_common_xfix score=0.793 def get_common_xfix(series, xfix="suffix"): """ Finds the longest common suffix or prefix of all the values in a series """ common_xfix = "" while True: common_xfixes = ( series.str[-(len(common_xfix) + 1) :] if xfix == "suffix" else series.str[: len(common_xfix) + 1] ---------------------------------------------------------------------- /openai/validators.py:common_completion_suffix_validator score=0.778 def common_completion_suffix_validator(df): """ This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation. """ error_msg = None immediate_msg = None optional_msg = None optional_fn = None ft_type = infer_task_type(df) ----------------------------------------------------------------------
In [6]:
Copied!
# Example: single best match for the fine-tuning CLI, with a longer 20-line preview.
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
/openai/cli.py:tools_register score=0.773 def tools_register(parser): subparsers = parser.add_subparsers( title="Tools", help="Convenience client side tools" ) def help(args): parser.print_help() parser.set_defaults(func=help) sub = subparsers.add_parser("fine_tunes.prepare_data") sub.add_argument( "-f", "--file", required=True, help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed." "This should be the local file path.", ) sub.add_argument( "-q", ----------------------------------------------------------------------