thisisam committed on
Commit
2cddae5
·
1 Parent(s): faf508c

Switch to loading Fara-7B directly with transformers

Browse files
Files changed (2) hide show
  1. app.py +68 -393
  2. requirements.txt +4 -1
app.py CHANGED
@@ -1,410 +1,85 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
- import os
4
  from PIL import Image
5
- import requests
6
- from io import BytesIO
7
 
8
- # Initialize the Inference Client
9
- client = InferenceClient(token=os.getenv("HF_TOKEN"))
10
-
11
- def create_demo_screenshot(task_type="general"):
 
 
 
 
 
 
 
 
 
12
  """
13
- Create a simple placeholder screenshot for demo purposes
14
- In actual use, this would be a real browser screenshot
15
  """
16
- # For now, return None - we'll use text-only mode
17
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def chat_with_fara(message, history, image=None):
20
- """
21
- Interact with Fara-7B using the vision-language model API
22
- """
23
  try:
24
- # Build the proper message format for Fara-7B
25
- system_prompt = """You are a web automation agent that performs actions on websites to fulfill user requests by calling various tools.
26
- You should stop execution at Critical Points. A Critical Point occurs in tasks like:
27
- - Checkout, Book, Purchase, Call, Email, Order
28
-
29
- A Critical Point requires the user's permission or personal/sensitive information (name, email, credit card, address, payment information, resume, etc.) to complete a transaction (purchase, reservation, sign-up, etc.), or to communicate as a human would (call, email, apply to a job, etc.).
30
-
31
- Guideline: Solve the task as far as possible up until a Critical Point.
32
-
33
- Examples:
34
- - If the task is to "call a restaurant to make a reservation," do not actually make the call. Instead, navigate to the restaurant's page and find the phone number.
35
- - If the task is to "order new size 12 running shoes," do not place the order. Instead, search for the right shoes that meet the criteria and add them to the cart.
36
-
37
- Some tasks, like answering questions, may not encounter a Critical Point at all."""
38
-
39
- # Prepare messages in the format expected by Fara-7B
40
- messages = [
41
- {"role": "system", "content": system_prompt}
42
- ]
43
-
44
- # Add history
45
- if history:
46
- for h in history:
47
- if h["role"] in ["user", "assistant"]:
48
- messages.append(h)
49
-
50
- # Add current message
51
- user_content = []
52
 
53
- # Add image if provided
54
- if image is not None:
55
- user_content.append({"type": "image", "image": image})
56
-
57
- # Add text
58
- user_content.append({"type": "text", "text": message})
59
-
60
- messages.append({
61
- "role": "user",
62
- "content": user_content if len(user_content) > 1 else message
63
- })
64
-
65
- # Try to use the Inference API
66
- try:
67
- response = client.chat_completion(
68
- messages=messages,
69
- model="microsoft/Fara-7B",
70
- max_tokens=512,
71
- temperature=0.7,
72
- )
73
-
74
- # Extract the response
75
- if hasattr(response, 'choices') and len(response.choices) > 0:
76
- return response.choices[0].message.content
77
- else:
78
- raise Exception("Unexpected response format")
79
-
80
- except Exception as api_error:
81
- error_str = str(api_error).lower()
82
-
83
- # Check for specific errors
84
- if "no api" in error_str or "not found" in error_str or "404" in error_str:
85
- # Model doesn't have Inference API - provide helpful demo response
86
- return generate_demo_response(message)
87
- elif "401" in error_str or "unauthorized" in error_str:
88
- return """❌ **Authentication Error**
89
-
90
- Please check:
91
- 1. Your `HF_TOKEN` is set in Space secrets
92
- 2. You have requested access to [microsoft/Fara-7B](https://huggingface.co/microsoft/Fara-7B)
93
- 3. Your token has read permissions
94
-
95
- To use Fara-7B locally instead:
96
- ```bash
97
- git clone https://github.com/microsoft/fara.git
98
- cd fara
99
- pip install -e .
100
- playwright install
101
- vllm serve "microsoft/Fara-7B" --port 5000
102
- ```
103
- """
104
- elif "403" in error_str or "forbidden" in error_str:
105
- return """❌ **Access Forbidden**
106
-
107
- You need to request access to the model:
108
- 1. Visit: https://huggingface.co/microsoft/Fara-7B
109
- 2. Click "Request access to this repository"
110
- 3. Wait for Microsoft to approve your request
111
-
112
- Once approved, make sure your `HF_TOKEN` is set in Space secrets.
113
- """
114
- else:
115
- # Unknown error - try demo mode
116
- return f"⚠️ API Error: {str(api_error)}\n\n**Demo Response:**\n\n" + generate_demo_response(message)
117
-
118
  except Exception as e:
119
- return f"Error: {str(e)}\n\nPlease check the Space logs for more details."
120
 
121
- def generate_demo_response(message):
122
- """
123
- Generate a helpful demo response when the API is not available
124
- """
125
- message_lower = message.lower()
126
-
127
- # Shopping/E-commerce tasks
128
- if any(word in message_lower for word in ['buy', 'shop', 'purchase', 'order', 'cart', 'shoes', 'product']):
129
- return """🛒 **Task: Shopping/Purchase**
130
-
131
- **Action Plan:**
132
- 1. 🔍 Navigate to e-commerce website
133
- 2. 🔎 Search for: [extracted product from your query]
134
- 3. 📋 Apply filters: price, rating, availability
135
- 4. ✅ Select best match
136
- 5. ➕ Add to cart
137
- 6. 🛑 **STOP** - Critical Point: Checkout requires payment info
138
-
139
- **What I would do with a screenshot:**
140
- - Identify search bar location
141
- - Read product listings
142
- - Click appropriate buttons
143
- - Navigate to cart
144
-
145
- **Next steps for you:**
146
- - Review cart
147
- - Complete checkout manually
148
-
149
- 💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
150
- """
151
-
152
- # Travel/booking tasks
153
- elif any(word in message_lower for word in ['flight', 'hotel', 'travel', 'book', 'trip']):
154
- return """✈️ **Task: Travel Booking**
155
-
156
- **Action Plan:**
157
- 1. 🌐 Navigate to travel site
158
- 2. 📅 Enter dates and destination
159
- 3. 🔍 Search options
160
- 4. 💰 Sort by price/rating
161
- 5. 📊 Compare top results
162
- 6. 🛑 **STOP** - Critical Point: Booking requires personal info
163
-
164
- **What I would do with a screenshot:**
165
- - Find date pickers
166
- - Enter search criteria
167
- - Click search button
168
- - Read results table
169
-
170
- **Next steps for you:**
171
- - Review options
172
- - Complete booking manually
173
-
174
- 💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
175
- """
176
-
177
- # Restaurant tasks
178
- elif any(word in message_lower for word in ['restaurant', 'food', 'dining', 'reservation']):
179
- return """🍽️ **Task: Restaurant Search**
180
-
181
- **Action Plan:**
182
- 1. 🔎 Search for restaurants
183
- 2. 📍 Filter by location and cuisine
184
- 3. ⭐ Check ratings and reviews
185
- 4. 📞 Find contact info
186
- 5. 🛑 **STOP** - Critical Point: Reservation requires personal info
187
-
188
- **What I would do with a screenshot:**
189
- - Identify search results
190
- - Read restaurant details
191
- - Extract phone number
192
- - Locate reservation link
193
-
194
- **Next steps for you:**
195
- - Call or book reservation manually
196
-
197
- 💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
198
- """
199
-
200
- # Government/grants (your specific use case!)
201
- elif any(word in message_lower for word in ['grant', 'funding', 'government', 'nsw', 'healthcare']):
202
- return """🏛️ **Task: Government Grants Research**
203
-
204
- **Action Plan:**
205
- 1. 🌐 Navigate to government grants portal
206
- 2. 🔎 Use search functionality
207
- 3. 📋 Filter by: healthcare, eligibility, deadline
208
- 4. 📊 Extract grant information
209
- 5. ✅ **COMPLETE** - No Critical Point
210
-
211
- **What I would do with a screenshot:**
212
- - Locate search bar
213
- - Read grant listings
214
- - Extract key details:
215
- - Grant title
216
- - Funding amount
217
- - Eligibility criteria
218
- - Application deadline
219
- - Contact information
220
-
221
- **Example output:**
222
- ```
223
- Grant: Healthcare Innovation Fund
224
- Amount: $50,000 - $500,000
225
- Eligibility: Registered healthcare providers
226
- Deadline: March 31, 2024
227
- Link: [grant URL]
228
- ```
229
-
230
- 💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM.*
231
- """
232
-
233
- # General response
234
- else:
235
- return """🤖 **Fara-7B Web Automation Agent**
236
-
237
- I help with web automation tasks! I can:
238
-
239
- ✅ Shopping & e-commerce
240
- ✅ Travel & booking
241
- ✅ Restaurant search
242
- ✅ Information extraction
243
- ✅ Government portals & grants
244
- ✅ Account navigation
245
-
246
- **How I work:**
247
- 1. 📸 Analyze browser screenshot (when provided)
248
- 2. 🎯 Understand your goal
249
- 3. 📝 Plan step-by-step actions
250
- 4. 🔧 Use browser tools (click, type, scroll)
251
- 5. 🛑 Stop at Critical Points (checkout, personal info)
252
-
253
- **Example tasks:**
254
- - "Find running shoes under $100"
255
- - "Search for flights to Tokyo"
256
- - "Find healthcare grants on the NSW government website"
257
- - "Look up Italian restaurants in Seattle"
258
-
259
- **To use with screenshots:**
260
- Upload a browser screenshot and describe your task!
261
-
262
- 💡 *Note: The Inference API may not be available for this model. For full functionality, host locally with vLLM:*
263
- ```bash
264
- vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
265
- ```
266
- """
267
-
268
- # Create the Gradio interface
269
- with gr.Blocks(theme=gr.themes.Soft(), title="Fara-7B Chat") as demo:
270
- gr.Markdown(
271
- """
272
- # 🤖 Fara-7B Web Automation Agent
273
-
274
- **Microsoft's specialized vision-language model for web automation**
275
-
276
- Fara-7B can analyze browser screenshots and plan web automation tasks.
277
-
278
- 💡 **How to use:**
279
- - Upload a browser screenshot (optional)
280
- - Describe your web automation task
281
- - Fara-7B will plan the actions needed
282
-
283
- ⚠️ **Note**: The Inference API may not be fully available for this model. For complete functionality including actual browser control, host locally with vLLM (see instructions below).
284
- """
285
- )
286
-
287
- with gr.Accordion("📚 About Fara-7B & Setup Instructions", open=False):
288
- gr.Markdown("""
289
- ### What is Fara-7B?
290
-
291
- Fara-7B is a 7B parameter vision-language model designed for computer use. It can:
292
- - Understand browser screenshots
293
- - Plan multi-step web automation tasks
294
- - Use tools (click, type, scroll, etc.)
295
- - Stop at "Critical Points" for safety
296
-
297
- ### Using Transformers Library (Colab/Local)
298
-
299
- ```python
300
- from transformers import pipeline
301
-
302
- pipe = pipeline("image-text-to-text", model="microsoft/Fara-7B")
303
- messages = [
304
- {
305
- "role": "user",
306
- "content": [
307
- {"type": "image", "url": "screenshot.jpg"},
308
- {"type": "text", "text": "Find running shoes"}
309
- ]
310
- },
311
- ]
312
- result = pipe(text=messages)
313
- ```
314
-
315
- ### Full Browser Automation (Local)
316
-
317
- ```bash
318
- # Clone repository
319
- git clone https://github.com/microsoft/fara.git
320
- cd fara
321
-
322
- # Setup environment
323
- python3 -m venv .venv
324
- source .venv/bin/activate
325
- pip install -e .
326
- playwright install
327
-
328
- # Host model
329
- vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
330
-
331
- # Run tasks
332
- fara-cli --task "your task here"
333
- ```
334
-
335
- **Resources:**
336
- - Model: https://huggingface.co/microsoft/Fara-7B
337
- - GitHub: https://github.com/microsoft/fara
338
- """)
339
-
340
- chatbot = gr.Chatbot(
341
- height=500,
342
- label="Chat",
343
- show_label=True,
344
- type="messages"
345
- )
346
 
347
  with gr.Row():
348
- with gr.Column(scale=3):
349
- msg = gr.Textbox(
350
- label="Task Description",
351
- placeholder="Example: Find healthcare grants on the NSW government website...",
352
- lines=2
353
- )
354
  with gr.Column(scale=1):
355
- image_input = gr.Image(
356
- label="Browser Screenshot (Optional)",
357
- type="pil",
358
- height=100
359
- )
360
-
361
- with gr.Row():
362
- send_btn = gr.Button("Send", variant="primary")
363
- clear_btn = gr.Button("Clear Chat")
364
-
365
- gr.Markdown("""
366
- ### 💡 Tips for Best Results
367
-
368
- - **With screenshot**: Upload a browser screenshot and describe what you want to accomplish
369
- - **Without screenshot**: Describe the web task, and Fara-7B will plan the approach
370
- - **Be specific**: Include details like website, search criteria, budget, etc.
371
- - **Critical Points**: Fara-7B will stop before checkout, booking, or entering personal info
372
-
373
- ### 🎯 Example Tasks
374
-
375
- - "Find healthcare grants for digital health projects in Australia"
376
- - "Search for running shoes under $100 on this e-commerce page"
377
- - "Look up restaurants in Seattle with 4+ stars for Italian food"
378
- - "Find the contact information on this website"
379
- """)
380
-
381
- def respond(message, image, chat_history):
382
- if not message.strip():
383
- return chat_history, None
384
-
385
- # Add user message to history
386
- user_msg = {"role": "user", "content": message}
387
- chat_history.append(user_msg)
388
 
389
- # Get response from Fara
390
- response = chat_with_fara(message, chat_history, image)
391
-
392
- # Add assistant response to history
393
- assistant_msg = {"role": "assistant", "content": response}
394
- chat_history.append(assistant_msg)
395
-
396
- return chat_history, None
397
-
398
- def clear_chat():
399
- return [], None
400
-
401
- msg.submit(respond, [msg, image_input, chatbot], [chatbot, image_input]).then(
402
- lambda: ("", None), None, [msg, image_input]
403
- )
404
- send_btn.click(respond, [msg, image_input, chatbot], [chatbot, image_input]).then(
405
- lambda: ("", None), None, [msg, image_input]
406
- )
407
- clear_btn.click(clear_chat, outputs=[chatbot, image_input])
408
 
409
  if __name__ == "__main__":
410
  demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoProcessor, AutoModelForVision2Seq
3
+ import torch
4
  from PIL import Image
 
 
5
 
6
# Load the Fara-7B processor and weights once, at import time.
print("Loading Fara-7B model...")

_FARA_REPO = "microsoft/Fara-7B"

processor = AutoProcessor.from_pretrained(_FARA_REPO, trust_remote_code=True)

# Half precision when CUDA is available, full precision otherwise.
if torch.cuda.is_available():
    _weights_dtype = torch.float16
else:
    _weights_dtype = torch.float32

# device_map="auto" leaves GPU/CPU placement to the loader.
model = AutoModelForVision2Seq.from_pretrained(
    _FARA_REPO,
    torch_dtype=_weights_dtype,
    device_map="auto",
    trust_remote_code=True,
)

print("Model loaded successfully!")
17
+
18
def chat(message, history, image):
    """
    Chat function using the local Fara-7B model.

    Args:
        message: Text typed by the user. May be empty, whitespace-only or
            None when only an image is supplied.
        history: Earlier turns of the conversation. Expected to be a list of
            ``{"role": ..., "content": ...}`` dicts (the chat UI is created
            with ``type="messages"``). Only plain-text user/assistant turns
            are replayed to the model; anything else is ignored.
        image: Optional screenshot attached to the current turn (the upload
            widget is configured to deliver a PIL image), or None.

    Returns:
        The generated reply as a string, or a human-readable message when no
        input was given or generation failed.
    """
    text = (message or "").strip()
    has_image = image is not None

    # Whitespace-only text counts as "no text" so the model never receives
    # an empty prompt.
    if not text and not has_image:
        return "Please provide text or an image."

    # Replay earlier text turns so follow-up questions keep their context.
    messages = []
    for turn in history or []:
        if not isinstance(turn, dict):
            continue
        role = turn.get("role")
        past = turn.get("content")
        if role in ("user", "assistant") and isinstance(past, str) and past.strip():
            messages.append(
                {"role": role, "content": [{"type": "text", "text": past}]}
            )

    # Current turn: image first (if any), then the text part.
    content = []
    if has_image:
        content.append({"type": "image", "image": image})
    content.append(
        {
            "type": "text",
            # With an image but no text, fall back to a generic request.
            "text": text or "Describe this image and what actions I can take.",
        }
    )
    messages.append({"role": "user", "content": content})

    try:
        # The processor handles the image and text formatting.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        outputs = model.generate(**inputs, max_new_tokens=500)

        # Drop the prompt tokens so only the newly generated text is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        return processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    except Exception as e:
        # Boundary handler: keep the UI responsive, but leave a traceback in
        # the logs instead of discarding it.
        import logging

        logging.getLogger(__name__).exception("Fara-7B generation failed")
        return f"Error generating response: {str(e)}"
67
 
68
# Assemble the Gradio page: a screenshot upload next to the chat panel.
with gr.Blocks(title="Fara-7B Simple Chat") as demo:
    gr.Markdown("# 🤖 Fara-7B Simple Chat")
    gr.Markdown("Running microsoft/Fara-7B directly using transformers.")

    with gr.Row():
        # First column: optional screenshot, delivered to chat() as a PIL image.
        with gr.Column(scale=1):
            image_input = gr.Image(
                label="Upload Screenshot (Optional)",
                type="pil",
            )

        # Second column: the chat panel; the uploaded image is forwarded to
        # chat() as its extra argument.
        with gr.Column(scale=2):
            chatbot = gr.ChatInterface(
                chat,
                type="messages",
                additional_inputs=[image_input],
            )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  gradio==5.0.2
2
  huggingface-hub==0.26.2
3
- Pillow
 
 
 
 
1
  gradio==5.0.2
2
  huggingface-hub==0.26.2
3
+ Pillow
4
+ transformers
5
+ torch
6
+ accelerate