Spaces:

thisisam
/

fara-7b-chat-test

Runtime error

App Files Files Community

thisisam committed on Dec 3, 2025

Commit

2cddae5

1 Parent(s): faf508c

Switch to loading Fara-7B directly with transformers

Browse files

Files changed (2) hide show

app.py +68 -393
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -1,410 +1,85 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-import os
 from PIL import Image
-import requests
-from io import BytesIO
-# Initialize the Inference Client
-client = InferenceClient(token=os.getenv("HF_TOKEN"))
-def create_demo_screenshot(task_type="general"):
     """
-    Create a simple placeholder screenshot for demo purposes
-    In actual use, this would be a real browser screenshot
     """
-    # For now, return None - we'll use text-only mode
-    return None
-def chat_with_fara(message, history, image=None):
-    """
-    Interact with Fara-7B using the vision-language model API
-    """
     try:
-        # Build the proper message format for Fara-7B
-        system_prompt = """You are a web automation agent that performs actions on websites to fulfill user requests by calling various tools.
-You should stop execution at Critical Points. A Critical Point occurs in tasks like:
-- Checkout, Book, Purchase, Call, Email, Order
-A Critical Point requires the user's permission or personal/sensitive information (name, email, credit card, address, payment information, resume, etc.) to complete a transaction (purchase, reservation, sign-up, etc.), or to communicate as a human would (call, email, apply to a job, etc.).
-Guideline: Solve the task as far as possible up until a Critical Point.
-Examples:
-- If the task is to "call a restaurant to make a reservation," do not actually make the call. Instead, navigate to the restaurant's page and find the phone number.
-- If the task is to "order new size 12 running shoes," do not place the order. Instead, search for the right shoes that meet the criteria and add them to the cart.
-Some tasks, like answering questions, may not encounter a Critical Point at all."""
-        # Prepare messages in the format expected by Fara-7B
-        messages = [
-            {"role": "system", "content": system_prompt}
-        ]
-        # Add history
-        if history:
-            for h in history:
-                if h["role"] in ["user", "assistant"]:
-                    messages.append(h)
-        # Add current message
-        user_content = []
-        # Add image if provided
-        if image is not None:
-            user_content.append({"type": "image", "image": image})
-        # Add text
-        user_content.append({"type": "text", "text": message})
-        messages.append({
-            "role": "user",
-            "content": user_content if len(user_content) > 1 else message
-        })
-        # Try to use the Inference API
-        try:
-            response = client.chat_completion(
-                messages=messages,
-                model="microsoft/Fara-7B",
-                max_tokens=512,
-                temperature=0.7,
-            )
-            # Extract the response
-            if hasattr(response, 'choices') and len(response.choices) > 0:
-                return response.choices[0].message.content
-            else:
-                raise Exception("Unexpected response format")
-        except Exception as api_error:
-            error_str = str(api_error).lower()
-            # Check for specific errors
-            if "no api" in error_str or "not found" in error_str or "404" in error_str:
-                # Model doesn't have Inference API - provide helpful demo response
-                return generate_demo_response(message)
-            elif "401" in error_str or "unauthorized" in error_str:
-                return """❌ **Authentication Error**
-Please check:
-1. Your `HF_TOKEN` is set in Space secrets
-2. You have requested access to [microsoft/Fara-7B](https://huggingface.co/microsoft/Fara-7B)
-3. Your token has read permissions
-To use Fara-7B locally instead:
-```bash
-git clone https://github.com/microsoft/fara.git
-cd fara
-pip install -e .
-playwright install
-vllm serve "microsoft/Fara-7B" --port 5000
-```
-"""
-            elif "403" in error_str or "forbidden" in error_str:
-                return """❌ **Access Forbidden**
-You need to request access to the model:
-1. Visit: https://huggingface.co/microsoft/Fara-7B
-2. Click "Request access to this repository"
-3. Wait for Microsoft to approve your request
-Once approved, make sure your `HF_TOKEN` is set in Space secrets.
-"""
-            else:
-                # Unknown error - try demo mode
-                return f"⚠️ API Error: {str(api_error)}\n\n**Demo Response:**\n\n" + generate_demo_response(message)
     except Exception as e:
-        return f"❌ Error: {str(e)}\n\nPlease check the Space logs for more details."
-def generate_demo_response(message):
-    """
-    Generate a helpful demo response when the API is not available
-    """
-    message_lower = message.lower()
-    # Shopping/E-commerce tasks
-    if any(word in message_lower for word in ['buy', 'shop', 'purchase', 'order', 'cart', 'shoes', 'product']):
-        return """🛒 **Task: Shopping/Purchase**
-**Action Plan:**
-1. 🔍 Navigate to e-commerce website
-2. 🔎 Search for: [extracted product from your query]
-3. 📋 Apply filters: price, rating, availability
-4. ✅ Select best match
-5. ➕ Add to cart
-6. 🛑 **STOP** - Critical Point: Checkout requires payment info
-**What I would do with a screenshot:**
-- Identify search bar location
-- Read product listings
-- Click appropriate buttons
-- Navigate to cart
-**Next steps for you:**
-- Review cart
-- Complete checkout manually
-💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
-"""
-    # Travel/booking tasks
-    elif any(word in message_lower for word in ['flight', 'hotel', 'travel', 'book', 'trip']):
-        return """✈️ **Task: Travel Booking**
-**Action Plan:**
-1. 🌐 Navigate to travel site
-2. 📅 Enter dates and destination
-3. 🔍 Search options
-4. 💰 Sort by price/rating
-5. 📊 Compare top results
-6. 🛑 **STOP** - Critical Point: Booking requires personal info
-**What I would do with a screenshot:**
-- Find date pickers
-- Enter search criteria
-- Click search button
-- Read results table
-**Next steps for you:**
-- Review options
-- Complete booking manually
-💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
-"""
-    # Restaurant tasks
-    elif any(word in message_lower for word in ['restaurant', 'food', 'dining', 'reservation']):
-        return """🍽️ **Task: Restaurant Search**
-**Action Plan:**
-1. 🔎 Search for restaurants
-2. 📍 Filter by location and cuisine
-3. ⭐ Check ratings and reviews
-4. 📞 Find contact info
-5. 🛑 **STOP** - Critical Point: Reservation requires personal info
-**What I would do with a screenshot:**
-- Identify search results
-- Read restaurant details
-- Extract phone number
-- Locate reservation link
-**Next steps for you:**
-- Call or book reservation manually
-💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
-"""
-    # Government/grants (your specific use case!)
-    elif any(word in message_lower for word in ['grant', 'funding', 'government', 'nsw', 'healthcare']):
-        return """🏛️ **Task: Government Grants Research**
-**Action Plan:**
-1. 🌐 Navigate to government grants portal
-2. 🔎 Use search functionality
-3. 📋 Filter by: healthcare, eligibility, deadline
-4. 📊 Extract grant information
-5. ✅ **COMPLETE** - No Critical Point
-**What I would do with a screenshot:**
-- Locate search bar
-- Read grant listings
-- Extract key details:
-  - Grant title
-  - Funding amount
-  - Eligibility criteria
-  - Application deadline
-  - Contact information
-**Example output:**
-```
-Grant: Healthcare Innovation Fund
-Amount: $50,000 - $500,000
-Eligibility: Registered healthcare providers
-Deadline: March 31, 2024
-Link: [grant URL]
-```
-💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
-"""
-    # General response
-    else:
-        return """🤖 **Fara-7B Web Automation Agent**
-I help with web automation tasks! I can:
-✅ Shopping & e-commerce
-✅ Travel & booking
-✅ Restaurant search
-✅ Information extraction
-✅ Government portals & grants
-✅ Account navigation
-**How I work:**
-1. 📸 Analyze browser screenshot (when provided)
-2. 🎯 Understand your goal
-3. 📝 Plan step-by-step actions
-4. 🔧 Use browser tools (click, type, scroll)
-5. 🛑 Stop at Critical Points (checkout, personal info)
-**Example tasks:**
-- "Find running shoes under $100"
-- "Search for flights to Tokyo"
-- "Find healthcare grants on the NSW government website"
-- "Look up Italian restaurants in Seattle"
-**To use with screenshots:**
-Upload a browser screenshot and describe your task!
-💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM:*
-```bash
-vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
-```
-"""
-# Create the Gradio interface
-with gr.Blocks(theme=gr.themes.Soft(), title="Fara-7B Chat") as demo:
-    gr.Markdown(
-        """
-        # 🤖 Fara-7B Web Automation Agent
-        **Microsoft's specialized vision-language model for web automation**
-        Fara-7B can analyze browser screenshots and plan web automation tasks.
-        💡 **How to use:**
-        - Upload a browser screenshot (optional)
-        - Describe your web automation task
-        - Fara-7B will plan the actions needed
-        ⚠️ **Note**: The Inference API may not be fully available for this model. For complete functionality including actual browser control, host locally with vLLM (see instructions below).
-        """
-    )
-    with gr.Accordion("📚 About Fara-7B & Setup Instructions", open=False):
-        gr.Markdown("""
-        ### What is Fara-7B?
-        Fara-7B is a 7B parameter vision-language model designed for computer use. It can:
-        - Understand browser screenshots
-        - Plan multi-step web automation tasks
-        - Use tools (click, type, scroll, etc.)
-        - Stop at "Critical Points" for safety
-        ### Using Transformers Library (Colab/Local)
-        ```python
-        from transformers import pipeline
-        pipe = pipeline("image-text-to-text", model="microsoft/Fara-7B")
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": "screenshot.jpg"},
-                    {"type": "text", "text": "Find running shoes"}
-                ]
-            },
-        ]
-        result = pipe(text=messages)
-        ```
-        ### Full Browser Automation (Local)
-        ```bash
-        # Clone repository
-        git clone https://github.com/microsoft/fara.git
-        cd fara
-        # Setup environment
-        python3 -m venv .venv
-        source .venv/bin/activate
-        pip install -e .
-        playwright install
-        # Host model
-        vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
-        # Run tasks
-        fara-cli --task "your task here"
-        ```
-        **Resources:**
-        - Model: https://huggingface.co/microsoft/Fara-7B
-        - GitHub: https://github.com/microsoft/fara
-        """)
-    chatbot = gr.Chatbot(
-        height=500,
-        label="Chat",
-        show_label=True,
-        type="messages"
-    )
     with gr.Row():
-        with gr.Column(scale=3):
-            msg = gr.Textbox(
-                label="Task Description",
-                placeholder="Example: Find healthcare grants on the NSW government website...",
-                lines=2
-            )
         with gr.Column(scale=1):
-            image_input = gr.Image(
-                label="Browser Screenshot (Optional)",
-                type="pil",
-                height=100
-            )
-    with gr.Row():
-        send_btn = gr.Button("Send", variant="primary")
-        clear_btn = gr.Button("Clear Chat")
-    gr.Markdown("""
-    ### 💡 Tips for Best Results
-    - **With screenshot**: Upload a browser screenshot and describe what you want to accomplish
-    - **Without screenshot**: Describe the web task, and Fara-7B will plan the approach
-    - **Be specific**: Include details like website, search criteria, budget, etc.
-    - **Critical Points**: Fara-7B will stop before checkout, booking, or entering personal info
-    ### 🎯 Example Tasks
-    - "Find healthcare grants for digital health projects in Australia"
-    - "Search for running shoes under $100 on this e-commerce page"
-    - "Look up restaurants in Seattle with 4+ stars for Italian food"
-    - "Find the contact information on this website"
-    """)
-    def respond(message, image, chat_history):
-        if not message.strip():
-            return chat_history, None
-        # Add user message to history
-        user_msg = {"role": "user", "content": message}
-        chat_history.append(user_msg)
-        # Get response from Fara
-        response = chat_with_fara(message, chat_history, image)
-        # Add assistant response to history
-        assistant_msg = {"role": "assistant", "content": response}
-        chat_history.append(assistant_msg)
-        return chat_history, None
-    def clear_chat():
-        return [], None
-    msg.submit(respond, [msg, image_input, chatbot], [chatbot, image_input]).then(
-        lambda: ("", None), None, [msg, image_input]
-    )
-    send_btn.click(respond, [msg, image_input, chatbot], [chatbot, image_input]).then(
-        lambda: ("", None), None, [msg, image_input]
-    )
-    clear_btn.click(clear_chat, outputs=[chatbot, image_input])
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from transformers import AutoProcessor, AutoModelForVision2Seq
+import torch
 from PIL import Image
+# Load model and processor directly
+# Using device_map="auto" to handle GPU/CPU automatically
+print("Loading Fara-7B model...")
+processor = AutoProcessor.from_pretrained("microsoft/Fara-7B", trust_remote_code=True)
+model = AutoModelForVision2Seq.from_pretrained(
+    "microsoft/Fara-7B",
+    trust_remote_code=True,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+)
+print("Model loaded successfully!")
+def chat(message, history, image):
     """
+    Chat function using the local Fara-7B model
     """
+    if not message and not image:
+        return "Please provide text or an image."
+    # Prepare content list for the model
+    content = []
+    # Add image if provided
+    if image:
+        content.append({"type": "image", "image": image})
+    # Add text
+    if message:
+        content.append({"type": "text", "text": message})
+    elif image:
+        # If only image is provided, ask for description
+        content.append({"type": "text", "text": "Describe this image and what actions I can take."})
+    # Construct messages
+    messages = [
+        {
+            "role": "user",
+            "content": content
+        }
+    ]
     try:
+        # Process inputs
+        # The processor handles the image and text formatting
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
+        # Generate response
+        outputs = model.generate(**inputs, max_new_tokens=500)
+        # Decode response
+        generated_text = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+        return generated_text
     except Exception as e:
+        return f"Error generating response: {str(e)}"
+# Create a simple Gradio interface
+# Layout: a title and subtitle, then one row with two columns — the screenshot
+# upload (scale=1) and the chat interface (scale=2).
+with gr.Blocks(title="Fara-7B Simple Chat") as demo:
+    gr.Markdown("# 🤖 Fara-7B Simple Chat")
+    gr.Markdown("Running microsoft/Fara-7B directly using transformers.")
     with gr.Row():
         with gr.Column(scale=1):
+            # type="pil": the upload is handed to chat() as a PIL image
+            image_input = gr.Image(type="pil", label="Upload Screenshot (Optional)")
+        with gr.Column(scale=2):
+            # additional_inputs supplies the current image as the third
+            # argument of chat(message, history, image)
+            chatbot = gr.ChatInterface(
+                fn=chat,
+                additional_inputs=[image_input],
+                type="messages"
+            )
 # Start the app only when this file is run as a script
 if __name__ == "__main__":
     demo.launch()

requirements.txt CHANGED Viewed

@@ -1,3 +1,6 @@
 gradio==5.0.2
 huggingface-hub==0.26.2
-Pillow

 gradio==5.0.2
 huggingface-hub==0.26.2
+Pillow
+# NOTE(review): transformers, torch and accelerate are unpinned while gradio and
+# huggingface-hub are pinned above — confirm the resolver selects a transformers
+# release that accepts huggingface-hub==0.26.2 and supports microsoft/Fara-7B.
+transformers
+torch
+accelerate