
# -*- coding: utf-8 -*-
"""florence_2_large_sample_inference.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1OuC9ipJzdM_Ka88Nc3MGBYzlf88YfKub

# Florence-2-large sample usage
"""

# Install dependencies (shell command; run with "!" in Colab):
# !pip install einops timm flash_attn

"""Source :

https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb

https://colab.research.google.com/#fileId=https%3A//huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb

"""

# Commented out IPython magic to ensure Python compatibility.
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import requests
import copy
# %matplotlib inline

model_id = 'microsoft/Florence-2-large'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
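
"""Run on GPU when available, as in the upstream sample notebook (a minimal sketch: float16 on CUDA, float32 on CPU). The prediction function below moves the processor outputs to the same device and dtype."""

import torch

# Choose device and dtype: half precision on CUDA, full precision on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = model.to(device, torch_dtype)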

"""## define the prediction function"""

def run_example(task_prompt, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer

"""## init image"""

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw)

image

"""## Run pre-defined tasks without additional inputs

### Caption
"""

task_prompt = '<CAPTION>'
run_example(task_prompt)

task_prompt = '<DETAILED_CAPTION>'
run_example(task_prompt)

task_prompt = '<MORE_DETAILED_CAPTION>'
run_example(task_prompt)

"""### Object detection

OD results format:
{'<OD>':
    {
    'bboxes': [[x1, y1, x2, y2], ...],
    'labels': ['label1', 'label2', ...]
    }
}
"""

task_prompt = '<OD>'
results = run_example(task_prompt)
print(results)

import matplotlib.pyplot as plt
import matplotlib.patches as patches
def plot_bbox(image, data):
    # Create a figure and axes
    fig, ax = plt.subplots()

    # Display the image
    ax.imshow(image)

    # Plot each bounding box
    for bbox, label in zip(data['bboxes'], data['labels']):
        # Unpack the bounding box coordinates
        x1, y1, x2, y2 = bbox
        # Create a Rectangle patch
        rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=1, edgecolor='r', facecolor='none')
        # Add the rectangle to the Axes
        ax.add_patch(rect)
        # Annotate the label
        plt.text(x1, y1, label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))

    # Remove the axis ticks and labels
    ax.axis('off')

    # Show the plot
    plt.show()

plot_bbox(image, results['<OD>'])

"""### Dense region caption

Dense region caption results format:
{'<DENSE_REGION_CAPTION>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['label1', 'label2', ...]}}
"""

task_prompt = '<DENSE_REGION_CAPTION>'
results = run_example(task_prompt)
print(results)

plot_bbox(image, results['<DENSE_REGION_CAPTION>'])

"""### Region proposal

Region proposal results format:
{'<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
"""

task_prompt = '<REGION_PROPOSAL>'
results = run_example(task_prompt)
print(results)

plot_bbox(image, results['<REGION_PROPOSAL>'])

"""## Run pre-defined tasks that requires additional inputs

### Phrase Grounding
Phrase grounding results format:
{'\': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
"""

task_prompt = '<CAPTION_TO_PHRASE_GROUNDING>'
results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
print(results)

plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])

"""### Referring expression segmentation

Referring expression segmentation results format:
{'<REFERRING_EXPRESSION_SEGMENTATION>': {'polygons': [[[polygon]], ...], 'labels': ['', '', ...]}}. One object is represented by a list of polygons; each polygon is [x1, y1, x2, y2, ..., xn, yn].
"""

task_prompt = '<REFERRING_EXPRESSION_SEGMENTATION>'
results = run_example(task_prompt, text_input="a green car")
print(results)

from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
import random
import numpy as np
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
def draw_polygons(image, prediction, fill_mask=False):
    """
    Draws segmentation masks with polygons on an image.

    Parameters:
    - image: PIL.Image to draw on (modified in place).
    - prediction: Dictionary containing 'polygons' and 'labels' keys.
                  'polygons' is a list of lists, each containing vertices of a polygon.
                  'labels' is a list of labels corresponding to each polygon.
    - fill_mask: Boolean indicating whether to fill the polygons with color.
    """
    draw = ImageDraw.Draw(image)

    # Set up scale factor if needed (use 1 if not scaling)
    scale = 1

    # Iterate over polygons and labels
    for polygons, label in zip(prediction['polygons'], prediction['labels']):
        color = random.choice(colormap)
        fill_color = random.choice(colormap) if fill_mask else None

        for _polygon in polygons:
            _polygon = np.array(_polygon).reshape(-1, 2)
            if len(_polygon) < 3:
                print('Invalid polygon:', _polygon)
                continue

            _polygon = (_polygon * scale).reshape(-1).tolist()

            # Draw the polygon
            if fill_mask:
                draw.polygon(_polygon, outline=color, fill=fill_color)
            else:
                draw.polygon(_polygon, outline=color)

            # Draw the label text
            draw.text((_polygon[0] + 8, _polygon[1] + 2), label, fill=color)

    # Save or display the image
    #image.show()  # Display the image
    display(image)

output_image = copy.deepcopy(image)
draw_polygons(output_image, results['<REFERRING_EXPRESSION_SEGMENTATION>'], fill_mask=True)

"""### region to segmentation


with additional region as inputs, format is '\\\\', [x1, y1, x2, y2] is the quantized corrdinates in [0, 999].
"""

task_prompt = '<REGION_TO_SEGMENTATION>'
# Region taken from the upstream sample notebook.
results = run_example(task_prompt, text_input="<loc_702><loc_575><loc_866><loc_772>")
print(results)

output_image = copy.deepcopy(image)
draw_polygons(output_image, results['<REGION_TO_SEGMENTATION>'], fill_mask=True)

"""### Open vocabulary detection

open vocabulary detection can detect both objects and ocr texts.

results format:

{ '\': {'bboxes': [[x1, y1, x2, y2], [x1, y1, x2, y2], ...]], 'bboxes_labels': ['label_1', 'label_2', ..],
'polygons': [[[x1, y1, x2, y2, ..., xn, yn], [x1, y1, ..., xn, yn]], ...], 'polygons_labels': ['label_1', 'label_2', ...]
}}
"""

task_prompt = '<OPEN_VOCABULARY_DETECTION>'
results = run_example(task_prompt, text_input="a green car")
print(results)

def convert_to_od_format(data):
    """
    Converts a dictionary with 'bboxes' and 'bboxes_labels' into a dictionary with separate 'bboxes' and 'labels' keys.

    Parameters:
    - data: The input dictionary with 'bboxes', 'bboxes_labels', 'polygons', and 'polygons_labels' keys.

    Returns:
    - A dictionary with 'bboxes' and 'labels' keys formatted for object detection results.
    """
    # Extract bounding boxes and labels
    bboxes = data.get('bboxes', [])
    labels = data.get('bboxes_labels', [])

    # Construct the output format
    od_results = {
        'bboxes': bboxes,
        'labels': labels
    }

    return od_results

bbox_results = convert_to_od_format(results['<OPEN_VOCABULARY_DETECTION>'])

plot_bbox(image, bbox_results)
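
"""convert_to_od_format drops the polygon outputs. A hedged sketch (not in the original notebook) of visualizing them with draw_polygons, which works because they share the flat [x1, y1, ..., xn, yn] layout of the segmentation tasks:"""

# Reuse draw_polygons for the OVD polygon outputs (e.g. OCR text regions), if any.
ovd_results = results['<OPEN_VOCABULARY_DETECTION>']
if ovd_results.get('polygons'):
    poly_image = copy.deepcopy(image)
    draw_polygons(poly_image,
                  {'polygons': ovd_results['polygons'], 'labels': ovd_results['polygons_labels']},
                  fill_mask=False)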

"""### region to texts"""

task_prompt = ''
results = run_example(task_prompt, text_input="")
print(results)

task_prompt = ''
results = run_example(task_prompt, text_input="")
print(results)

"""## ocr related tasks"""

url = "http://ecx.images-amazon.com/images/I/51UUzBDAMsL.jpg?download=true"
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')

image

task_prompt = '<OCR>'
run_example(task_prompt)

task_prompt = '<OCR_WITH_REGION>'
results = run_example(task_prompt)
print(results)
# OCR results format:
# {'<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}

def draw_ocr_bboxes(image, prediction):
    scale = 1
    draw = ImageDraw.Draw(image)
    bboxes, labels = prediction['quad_boxes'], prediction['labels']
    for box, label in zip(bboxes, labels):
        color = random.choice(colormap)
        new_box = (np.array(box) * scale).tolist()
        draw.polygon(new_box, width=3, outline=color)
        draw.text((new_box[0] + 8, new_box[1] + 2),
                  "{}".format(label),
                  align="right",
                  fill=color)
    display(image)

output_image = copy.deepcopy(image)
draw_ocr_bboxes(output_image, results['<OCR_WITH_REGION>'])
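
"""As a follow-up (not in the original notebook), the detected text regions can be cropped out of the image; a minimal sketch that takes the axis-aligned bounding box of each quad's four corners:"""

def crop_ocr_regions(image, prediction):
    # Hypothetical helper: each quad box is a flat [x1, y1, ..., x4, y4] list
    # of pixel coordinates; crop its axis-aligned bounding box.
    crops = []
    for quad in prediction['quad_boxes']:
        xs, ys = quad[0::2], quad[1::2]
        crops.append(image.crop((min(xs), min(ys), max(xs), max(ys))))
    return crops

for crop in crop_ocr_regions(image, results['<OCR_WITH_REGION>']):
    display(crop)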