Code search
Code search¶
We index our own openai-python code repository and show how it can be searched. We implement a simple version of file parsing and extraction of functions from Python files.
In [1]:
Copied!
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Extract the function name from a code snippet whose first line begins with "def ".

    Parameters
    ----------
    code : str
        Source text starting with a ``def `` line, e.g. ``"def foo(a, b):"``.

    Returns
    -------
    str
        The identifier between ``def `` and the opening parenthesis.
    """
    assert code.startswith("def ")
    # Slice out everything between the "def " prefix and the argument list.
    name_start = len("def ")
    name_end = code.index("(")
    return code[name_start:name_end]
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source block starting at line ``i``.

    Starting from ``all_lines[i]`` (assumed to be a ``def`` line), keep taking
    subsequent lines while they are blank or start with whitespace / a closing
    parenthesis — i.e. while they still belong to the function body — and stop
    at the first line that returns to column 0.

    Parameters
    ----------
    all_lines : list[str]
        The file's source split into lines.
    i : int
        Index of the first line of the block.

    Returns
    -------
    str
        The block's lines joined with ``"\n"``.
    """
    ret = [all_lines[i]]
    # Bound the scan by the end of the file instead of the original arbitrary
    # 10000-line cap, which both truncated very long functions and wasted
    # iterations past EOF.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)
def get_functions(filepath):
    """
    Yield every top-level function defined in a Python file.

    Each yielded dict has keys ``"code"`` (the function's full source text),
    ``"function_name"``, and ``"filepath"``. Only lines that start with
    ``def `` at column 0 are considered, so methods and nested functions are
    intentionally skipped.

    Parameters
    ----------
    filepath : str
        Path to a ``.py`` file.

    Yields
    ------
    dict
        ``{"code": str, "function_name": str, "filepath": str}``.
    """
    # Close the file deterministically: the original `open(filepath).read()`
    # inside a generator leaked the handle until garbage collection.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# get user root directory
root_dir = os.path.expanduser("~")

# note: for this code to work, the openai-python repo must be downloaded and
# placed in your root directory
# path to code repository directory
code_root = os.path.join(root_dir, "openai-python")

# recursively collect every .py file under the repository root
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], "*.py"))]
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# flatten the per-file generators into one list of function records
# (replaces the original nested append loop with a comprehension)
all_funcs = [func for code_file in code_files for func in get_functions(code_file)]
print("Total number of functions extracted:", len(all_funcs))
import os
from glob import glob
import pandas as pd
def get_function_name(code):
    """
    Extract the function name from a code snippet whose first line begins with "def ".

    Parameters
    ----------
    code : str
        Source text starting with a ``def `` line, e.g. ``"def foo(a, b):"``.

    Returns
    -------
    str
        The identifier between ``def `` and the opening parenthesis.
    """
    assert code.startswith("def ")
    # Slice out everything between the "def " prefix and the argument list.
    name_start = len("def ")
    name_end = code.index("(")
    return code[name_start:name_end]
def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source block starting at line ``i``.

    Starting from ``all_lines[i]`` (assumed to be a ``def`` line), keep taking
    subsequent lines while they are blank or start with whitespace / a closing
    parenthesis — i.e. while they still belong to the function body — and stop
    at the first line that returns to column 0.

    Parameters
    ----------
    all_lines : list[str]
        The file's source split into lines.
    i : int
        Index of the first line of the block.

    Returns
    -------
    str
        The block's lines joined with ``"\n"``.
    """
    ret = [all_lines[i]]
    # Bound the scan by the end of the file instead of the original arbitrary
    # 10000-line cap, which both truncated very long functions and wasted
    # iterations past EOF.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)
def get_functions(filepath):
    """
    Yield every top-level function defined in a Python file.

    Each yielded dict has keys ``"code"`` (the function's full source text),
    ``"function_name"``, and ``"filepath"``. Only lines that start with
    ``def `` at column 0 are considered, so methods and nested functions are
    intentionally skipped.

    Parameters
    ----------
    filepath : str
        Path to a ``.py`` file.

    Yields
    ------
    dict
        ``{"code": str, "function_name": str, "filepath": str}``.
    """
    # Close the file deterministically: the original `open(filepath).read()`
    # inside a generator leaked the handle until garbage collection.
    with open(filepath) as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}
# get user root directory
root_dir = os.path.expanduser("~")

# note: for this code to work, the openai-python repo must be downloaded and
# placed in your root directory
# path to code repository directory
code_root = os.path.join(root_dir, "openai-python")

# recursively collect every .py file under the repository root
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], "*.py"))]
print("Total number of py files:", len(code_files))

if len(code_files) == 0:
    print("Double check that you have downloaded the openai-python repo and set the code_root variable correctly.")

# flatten the per-file generators into one list of function records
# (replaces the original nested append loop with a comprehension)
all_funcs = [func for code_file in code_files for func in get_functions(code_file)]
print("Total number of functions extracted:", len(all_funcs))
Total number of py files: 51 Total number of functions extracted: 97
In [2]:
Copied!
# NOTE(review): `openai.embeddings_utils` and the `engine=` keyword were removed
# in openai-python v1.x; this cell requires openai<1.0 — confirm the pinned version.
from openai.embeddings_utils import get_embedding
# Build a DataFrame of function records and embed each function's source code.
df = pd.DataFrame(all_funcs)
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Strip the local repo prefix so file paths are repo-relative.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# Persist embeddings for reuse; assumes a local "data/" directory exists — TODO confirm.
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
# NOTE(review): `openai.embeddings_utils` and the `engine=` keyword were removed
# in openai-python v1.x; this cell requires openai<1.0 — confirm the pinned version.
from openai.embeddings_utils import get_embedding
# Build a DataFrame of function records and embed each function's source code.
df = pd.DataFrame(all_funcs)
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Strip the local repo prefix so file paths are repo-relative.
df['filepath'] = df['filepath'].apply(lambda x: x.replace(code_root, ""))
# Persist embeddings for reuse; assumes a local "data/" directory exists — TODO confirm.
df.to_csv("data/code_search_openai-python.csv", index=False)
df.head()
Out[2]:
code | function_name | filepath | code_embedding | |
---|---|---|---|---|
0 | def _console_log_level():\n if openai.log i... | _console_log_level | /openai/util.py | [0.03389773145318031, -0.004390408284962177, 0... |
1 | def log_debug(message, **params):\n msg = l... | log_debug | /openai/util.py | [-0.004034275189042091, 0.004895383026450872, ... |
2 | def log_info(message, **params):\n msg = lo... | log_info | /openai/util.py | [0.004882764536887407, 0.0033515947870910168, ... |
3 | def log_warn(message, **params):\n msg = lo... | log_warn | /openai/util.py | [0.002535992069169879, -0.010829543694853783, ... |
4 | def logfmt(props):\n def fmt(key, val):\n ... | logfmt | /openai/util.py | [0.016732551157474518, 0.017367802560329437, 0... |
In [3]:
Copied!
from openai.embeddings_utils import cosine_similarity
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the indexed functions by embedding similarity to a natural-language query.

    Embeds ``code_query``, scores every row of ``df`` by cosine similarity
    against its precomputed ``code_embedding``, and returns the top ``n`` rows.
    Note this writes a ``'similarities'`` column onto ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``code``, ``function_name``, ``filepath`` and
        ``code_embedding`` columns.
    code_query : str
        Free-form search text.
    n : int
        Number of top matches to return.
    pprint : bool
        If True, print each match's location, score, and first ``n_lines`` lines.
    n_lines : int
        Number of source lines to show per match when printing.

    Returns
    -------
    pandas.DataFrame
        The ``n`` best-matching rows, sorted by descending similarity.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            print(row.filepath + ":" + row.function_name + " score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-'*70)
    return res

res = search_functions(df, 'Completions API tests', n=3)
from openai.embeddings_utils import cosine_similarity
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the indexed functions by embedding similarity to a natural-language query.

    Embeds ``code_query``, scores every row of ``df`` by cosine similarity
    against its precomputed ``code_embedding``, and returns the top ``n`` rows.
    Note this writes a ``'similarities'`` column onto ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``code``, ``function_name``, ``filepath`` and
        ``code_embedding`` columns.
    code_query : str
        Free-form search text.
    n : int
        Number of top matches to return.
    pprint : bool
        If True, print each match's location, score, and first ``n_lines`` lines.
    n_lines : int
        Number of source lines to show per match when printing.

    Returns
    -------
    pandas.DataFrame
        The ``n`` best-matching rows, sorted by descending similarity.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            print(row.filepath + ":" + row.function_name + " score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-'*70)
    return res

res = search_functions(df, 'Completions API tests', n=3)
/openai/tests/test_endpoints.py:test_completions score=0.826 def test_completions(): result = openai.Completion.create(prompt="This was a test", n=5, engine="ada") assert len(result.choices) == 5 ---------------------------------------------------------------------- /openai/tests/test_endpoints.py:test_completions_model score=0.811 def test_completions_model(): result = openai.Completion.create(prompt="This was a test", n=5, model="ada") assert len(result.choices) == 5 assert result.model.startswith("ada") ---------------------------------------------------------------------- /openai/tests/test_endpoints.py:test_completions_multiple_prompts score=0.808 def test_completions_multiple_prompts(): result = openai.Completion.create( prompt=["This was a test", "This was another test"], n=5, engine="ada" ) assert len(result.choices) == 10 ----------------------------------------------------------------------
In [4]:
Copied!
# Example: semantic search for the fine-tuning data-validation helpers.
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
res = search_functions(df, 'fine-tuning input data validation logic', n=3)
/openai/validators.py:format_inferrer_validator score=0.751 def format_inferrer_validator(df): """ This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification. It will also suggest to use ada and explain train/validation split benefits. """ ft_type = infer_task_type(df) immediate_msg = None ---------------------------------------------------------------------- /openai/validators.py:get_validators score=0.748 def get_validators(): return [ num_examples_validator, lambda x: necessary_column_validator(x, "prompt"), lambda x: necessary_column_validator(x, "completion"), additional_column_validator, non_empty_field_validator, ---------------------------------------------------------------------- /openai/validators.py:infer_task_type score=0.738 def infer_task_type(df): """ Infer the likely fine-tuning task type from the data """ CLASSIFICATION_THRESHOLD = 3 # min_average instances of each class if sum(df.prompt.str.len()) == 0: return "open-ended generation" ----------------------------------------------------------------------
In [5]:
Copied!
# Example: search for suffix/prefix utilities, showing 10 lines of each match.
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
res = search_functions(df, 'find common suffix', n=2, n_lines=10)
/openai/validators.py:get_common_xfix score=0.793 def get_common_xfix(series, xfix="suffix"): """ Finds the longest common suffix or prefix of all the values in a series """ common_xfix = "" while True: common_xfixes = ( series.str[-(len(common_xfix) + 1) :] if xfix == "suffix" else series.str[: len(common_xfix) + 1] ---------------------------------------------------------------------- /openai/validators.py:common_completion_suffix_validator score=0.778 def common_completion_suffix_validator(df): """ This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation. """ error_msg = None immediate_msg = None optional_msg = None optional_fn = None ft_type = infer_task_type(df) ----------------------------------------------------------------------
In [6]:
Copied!
# Example: single best match for the fine-tuning CLI, with a longer 20-line preview.
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)
/openai/cli.py:tools_register score=0.773 def tools_register(parser): subparsers = parser.add_subparsers( title="Tools", help="Convenience client side tools" ) def help(args): parser.print_help() parser.set_defaults(func=help) sub = subparsers.add_parser("fine_tunes.prepare_data") sub.add_argument( "-f", "--file", required=True, help="JSONL, JSON, CSV, TSV, TXT or XLSX file containing prompt-completion examples to be analyzed." "This should be the local file path.", ) sub.add_argument( "-q", ----------------------------------------------------------------------