
# -*- coding: utf-8 -*-
"""openai_api_pub.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10BYB9ANxfFZ9m1SX0fNLD5jscXpgJbt7

# OpenAI API

Links:

*   https://platform.openai.com/
*   https://platform.openai.com/docs/introduction
*   https://platform.openai.com/docs/api-reference/streaming
*   https://platform.openai.com/docs/quickstart?context=python
*   https://platform.openai.com/docs/guides/text-generation/chat-completions-api

# Install OpenAI
"""

!pip install openai
from openai import OpenAI

"""# OpenAI API key

* Click on this link : https://platform.openai.com/api-keys

* Click on "+ Create new secret key"

* Copy and paste your key below after "%env OPENAI_API_KEY="

"""

# Commented out IPython magic to ensure Python compatibility.
# %env OPENAI_API_KEY=
!echo $OPENAI_API_KEY
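
"""Alternatively, to avoid leaving the key in the notebook itself, a minimal sketch that prompts for it interactively with getpass:"""

import os
from getpass import getpass

# Prompt for the key only if it is not already set in the environment.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")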

"""# Billing

* Click on this link : https://platform.openai.com/account/billing/overview

* Add a payment method.

* Add an amount to credit balance.

# Create OpenAI client
"""

client = OpenAI()

"""# Chat"""

# Stream the response token by token instead of waiting for the full completion.
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")

completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message.content)

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)

for choice in response.choices:
  print(choice.message.content)

"""# Assistant

* https://platform.openai.com/docs/assistants/overview

* https://platform.openai.com/docs/assistants/how-it-works

"""

import time
from IPython.display import Markdown

assistant = client.beta.assistants.create(
    name="Math Tutor",
    instructions="You are a personal math tutor. Write and run code to answer math questions.",
    tools=[{"type": "code_interpreter"}],
    model="gpt-4-1106-preview"
)

thread = client.beta.threads.create()

message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    # content="I need to solve the equation `3x + 11 = 14`. Can you help me?"
    content="I need to solve the equation `x² = 2`. Can you help me?"
)

run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Please address the user as Jane Doe. The user has a premium account."
)

# Poll until the run leaves the 'queued' / 'in_progress' states.
while True:
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )
  print(run.status)
  if run.status not in ('queued', 'in_progress'):
    break
  time.sleep(1)

messages = client.beta.threads.messages.list(
  thread_id=thread.id
)

answer = messages.data[0].content[0].text.value
# Convert the LaTeX delimiters \( \) \[ \] into the $ delimiters Markdown renders.
answer = answer.replace(r'\(', '$') \
               .replace(r'\)', '$') \
               .replace(r'\[', '\n$') \
               .replace(r'\]', '$\n')
display(Markdown(answer))

"""# Speech

* https://platform.openai.com/docs/guides/text-to-speech
* https://platform.openai.com/docs/guides/speech-to-text
"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile speech.sh
# curl https://api.openai.com/v1/audio/speech \
#   -H "Authorization: Bearer $OPENAI_API_KEY" \
#   -H "Content-Type: application/json" \
#   -d '{
#     "model": "tts-1",
#     "input": "The quick brown fox jumped over the lazy dog.",
#     "voice": "alloy"
#   }' \
#   --output fox.mp3
#

!sh speech.sh

from google.colab import files
# files.download("fox.mp3")

speech_file_path = "speech.mp3"
response = client.audio.speech.create(
  model="tts-1",
  voice="alloy",
  input="Today is a wonderful day to build something people love!"
)

response.stream_to_file(speech_file_path)

# files.download("speech.mp3")

with open("speech.mp3", "rb") as audio_file:
  transcript = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file
  )
print(transcript.text)
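
"""Whisper also exposes a translations endpoint that transcribes non-English speech directly into English. A minimal sketch reusing the file generated above (already English, so the output should match the transcription):"""

with open("speech.mp3", "rb") as f:
  translation = client.audio.translations.create(
    model="whisper-1",
    file=f
  )
print(translation.text)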

"""# Images

* https://platform.openai.com/docs/guides/images?context=node
* https://platform.openai.com/docs/guides/vision

"""

!pip install pillow
import urllib.request
from PIL import Image
from IPython.display import display
import textwrap

"""Example of image generation from a short description ("an astronaut riding a horse") and detailed description of the generated image."""

response = client.images.generate(
  model="dall-e-3",
  prompt="an astronaut riding a horse",
  size="1024x1024",
  quality="standard",
  n=1,
)

generated_image_url = response.data[0].url
# print(generated_image_url)

generated_image_name = "astronaut.jpg"
urllib.request.urlretrieve(generated_image_url, generated_image_name)
generated_image = Image.open(generated_image_name)
generated_image.thumbnail((500, 500))
display(generated_image)

response_astronaut = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image?"},
        {
          "type": "image_url",
          "image_url": {
            "url": generated_image_url,
          },
        },
      ],
    }
  ],
  max_tokens=300,
)

print("")

content = response_astronaut.choices[0].message.content
content = textwrap.wrap(content, width=80)
for line in content:
  print(line)

url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

imageName1 = "file1.jpg"
urllib.request.urlretrieve(url, imageName1)
img1 = Image.open(imageName1)
img1.thumbnail((600, 400))
# display(img1)  # this works but only if it is the only output

response = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image?"},
        {
          "type": "image_url",
          "image_url": {
            "url": url,
          },
        },
      ],
    }
  ],
  max_tokens=300,
)

# print(response.choices[0].message.content)

display(img1)
print("")

# import textwrap
content = response.choices[0].message.content
content = textwrap.wrap(content, width=80)
for line in content:
  print(line)
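
"""The vision endpoint also accepts images passed inline as base64 data URLs, which avoids hosting the file anywhere. A minimal sketch reusing the local copy downloaded above:"""

import base64

# Encode the local image as a base64 data URL and send it to the vision model.
with open(imageName1, "rb") as f:
  b64_image = base64.b64encode(f.read()).decode("utf-8")

response_local = client.chat.completions.create(
  model="gpt-4-vision-preview",
  messages=[
    {
      "role": "user",
      "content": [
        {"type": "text", "text": "Describe this image in one sentence."},
        {
          "type": "image_url",
          "image_url": {"url": f"data:image/jpeg;base64,{b64_image}"},
        },
      ],
    }
  ],
  max_tokens=100,
)
print(response_local.choices[0].message.content)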

"""## Embedding

https://clemenssiebler.com/posts/azure-openai-service-embeddings-tutorial/

https://www.datacamp.com/tutorial/introduction-to-text-embeddings-with-the-open-ai-api

https://platform.openai.com/docs/guides/embeddings/use-cases

Install requirements
"""

!pip install openai scipy plotly-express scikit-learn umap-learn

import os
import openai
from scipy.spatial import distance
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

"""The following helper function can be used to embed a line of text using the OpenAI API. In the code, we are using the existing ada version 2 to generate the embeddings."""

default_model = "text-embedding-ada-002"
# default_model = "davinci"

def get_embedding(text, model=default_model):
  text = text.replace("\n", " ")
  return client.embeddings.create(input=[text], model=model).data[0].embedding

# Equivalent helper for the pre-v1 openai library:
# def get_embedding(text_to_embed):
#     # Embed a line of text
#     response = openai.Embedding.create(
#         model="text-embedding-ada-002",
#         input=[text_to_embed]
#     )
#     # Extract the AI output embedding as a list of floats
#     embedding = response["data"][0]["embedding"]
#     return embedding
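
"""The embeddings endpoint also accepts a list of inputs, so several texts can be embedded in one request. A minimal batch-helper sketch (get_embeddings is a hypothetical name, not part of the tutorials linked above):"""

def get_embeddings(texts, model=default_model):
  # One API call for the whole batch instead of one call per text.
  texts = [t.replace("\n", " ") for t in texts]
  response = client.embeddings.create(input=texts, model=model)
  return [item.embedding for item in response.data]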

import torch

def torch_embedding(text):
  # Wrap the embedding in a torch tensor so we can do vector arithmetic on it.
  return torch.tensor(get_embedding(text))

man = torch_embedding("man")
print(man)
woman = torch_embedding("woman")
boy = torch_embedding("boy")
girl = torch_embedding("girl")
king = torch_embedding("king")
queen = torch_embedding("queen")
house = torch_embedding("house")
castle = torch_embedding("castle")

def distance1(x, y):
  # Euclidean (L2) distance: smaller means closer.
  return torch.sum((x - y).pow(2)).pow(0.5)

def distance2(x, y):
  # Cosine similarity: larger means more similar (not a true distance).
  return torch.nn.CosineSimilarity(dim=0)(x, y)

dist = distance2

print("man-woman:  ", dist(man, woman))
print("boy-girl:   ", dist(boy, girl))
print("man-boy:    ", dist(man, boy))
print("man-girl:   ", dist(man, girl))
print("woman-boy:  ", dist(woman, boy))
print("woman-girl: ", dist(woman, girl))
print("king-queen: ", dist(king, queen))
print("man-king:   ", dist(man, king))
print("man-queen:  ", dist(man, queen))
print("woman-king: ", dist(woman, king))
print("woman-queen:", dist(woman, queen))
print("man-house:  ", dist(man, house))
print("man-castle: ", dist(man, castle))
print("king-house: ", dist(king, house))
print("king-castle:", dist(king, castle))
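
"""A quick sanity check on these vectors is the classic word-analogy test: does king - man + woman land near queen? A minimal sketch reusing the tensors above (results vary; ada embeddings are not guaranteed to reproduce the word2vec analogy):"""

analogy = king - man + woman

# Compare the analogy vector against a few candidates (higher = more similar).
for name, vec in [("queen", queen), ("king", king), ("woman", woman), ("castle", castle)]:
  print(name, float(dist(analogy, vec)))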

"""Display a word map using a given reducer.

3 algorithms are used to reduce the dimensionality of the embedding space:


*   PCA : Principal Component analysis
*   TSNE : T-distributed Stochastic Neighbor Embedding
*   UMAP : Uniform Manifold Approximation and Projection

PCA gives always the same result, the two other give different results when applied successively several times.


"""

def wordmap(words, reducer):
  # Embed each word, then project the embeddings to 2D with the given reducer.
  embeddings = [get_embedding(word) for word in words]
  embeddings_2d = reducer.fit_transform(torch.tensor(embeddings))
  fig = plt.figure(figsize=(5,5))
  ax = fig.add_subplot(111)
  ax.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1])
  for i, txt in enumerate(words):
    plt.text(embeddings_2d[i,0], embeddings_2d[i,1], txt[0:30])
  fig.show()

"""Example of word list."""

words = ["man", "woman", "boy", "girl", "king", "queen", "house", "castle", "dog", "wolf", "cat", "lion"]

"""Examples of word maps."""

wordmap(words, PCA(n_components=2))

wordmap(words, TSNE(n_components=2, perplexity=6))

wordmap(words, TSNE(n_components=2, perplexity=6))

wordmap(words, UMAP())

wordmap(words, UMAP())

"""In this section, we will consider the Amazon musical instrument review data freely available from Kaggle. The data can also be downloaded from my Github account as follows:"""

import pandas as pd

data_URL =  "https://raw.githubusercontent.com/keitazoumana/Experimentation-Data/main/Musical_instruments_reviews.csv"

review_df = pd.read_csv(data_URL)
review_df.head()

"""Out of all the columns, we are only interested in the reviewText column."""

review_df = review_df[['reviewText']]
print("Data shape: {}".format(review_df.shape))
display(review_df.head())

"""There are many reviews in the dataset. For cost optimization purpose we will only use 100 randomly selected rows."""

review_df = review_df.sample(12)
review_df["embedding"] = review_df["reviewText"].astype(str).apply(get_embedding)

# Make the index start from 0
review_df = review_df.reset_index(drop=True)

review_df.head(10)

print(review_df)

"""Using K-means clustering requires predefining the number of clusters to use, and we will set that number to 3 with the n_clusters parameter as follows:

"""

kmeans = KMeans(n_clusters=3)
kmeans.fit(review_df["embedding"].tolist())
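
"""A quick look at how many reviews landed in each cluster (counts vary with the random sample and the K-Means initialization):"""

import numpy as np

print(np.bincount(kmeans.labels_))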

"""Humans are typically only able to visualize up to three dimensions. This section will use the UMAP, a relatively fast and scalable tool to perform dimensionality reduction.

First, we define an instance of the UMAP class and apply the fit_transform function to the embeddings, which generates a two-dimensional representation of the reviews embedding that can be plotted.

"""

reducer = UMAP()
embeddings_2d = reducer.fit_transform(review_df["embedding"].tolist())

"""Finally, create a scatter plot of the 2-dimensional embeddings. The x and y coordinates are respectively taken from embeddings_2d[: , 0] and embeddings_2d[: , 1]

The clusters will be visually distinct:

"""

fig = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], color=kmeans.labels_, width=400, height=400)
fig.show()

"""There are overall three main clusters with different colors. The color of each review in the figure is determined by the cluster label/number assigned to it by the K-Means model. Also, the positioning of each point gives a visual representation of how similar a given review of the others.

"""

# Same scatter plot with matplotlib, labelling each point with a review snippet.
fig2 = plt.figure(figsize=(5,5))
ax2 = fig2.add_subplot(111)
ax2.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1])
for i, txt in enumerate(review_df["reviewText"]):
  plt.text(embeddings_2d[i,0], embeddings_2d[i,1], txt[0:30])

fig2.show()

for item in review_df["reviewText"]:
  print(item)
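
"""A common use of these embeddings is semantic search: embed a query, then rank the reviews by cosine distance. A minimal sketch with a hypothetical query, reusing the scipy import from above:"""

query_embedding = get_embedding("great strings for an acoustic guitar")  # hypothetical query

# Smaller cosine distance means semantically closer to the query.
review_df["dist"] = review_df["embedding"].apply(
    lambda e: distance.cosine(query_embedding, e))
print(review_df.sort_values("dist")[["dist", "reviewText"]].head(3))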