Advanced Process Automation Strategies for Enterprise Workflows
Enterprise organizations are increasingly adopting sophisticated automation strategies to streamline operations, reduce costs, and improve efficiency. This guide explores advanced techniques that go beyond simple task automation to create intelligent, adaptive, and scalable workflow systems.
Understanding the Enterprise Automation Landscape
Modern enterprise automation encompasses multiple layers of complexity, from basic rule-based automation to sophisticated AI-driven process orchestration. Understanding this landscape is crucial for developing effective automation strategies that deliver measurable business value.
The Evolution of Enterprise Automation
Enterprise automation has evolved through several distinct phases:
Phase 1: Task-Level Automation
- Simple scripting and batch processing
- Repetitive data entry automation
- Basic file transfer and data synchronization
Phase 2: Process-Level Automation
- Workflow orchestration platforms
- Business process management (BPM) systems
- Integration of multiple systems and applications
Phase 3: Intelligent Automation
- AI and machine learning integration
- Natural language processing for document automation
- Predictive analytics for proactive process optimization
Phase 4: Adaptive Automation
- Self-healing and self-optimizing systems
- Dynamic workflow adjustment based on real-time conditions (see the sketch after this list)
- Continuous learning and improvement mechanisms
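The jump to Phase 4 is easiest to see in code. Below is a minimal, hypothetical sketch of dynamic adjustment: a dispatcher that re-routes work at runtime based on its current backlog. The fast/thorough handlers and the backlog threshold are illustrative assumptions, not a reference implementation:

```python
import asyncio

# Hypothetical threshold; a real system would derive this from monitored
# metrics such as queue depth, latency SLOs, or error rates.
FAST_PATH_THRESHOLD = 100

async def fast_path(item: str) -> str:
    return f"{item}: quick heuristic result"

async def thorough_path(item: str) -> str:
    await asyncio.sleep(0.1)  # simulate a slower, higher-quality step
    return f"{item}: full analysis result"

async def adaptive_dispatch(queue: asyncio.Queue) -> None:
    """Route each item based on the current backlog - one Phase 4 behavior."""
    while not queue.empty():
        item = await queue.get()
        # Dynamic adjustment: under heavy load, trade quality for throughput
        handler = fast_path if queue.qsize() > FAST_PATH_THRESHOLD else thorough_path
        print(await handler(item))

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    for i in range(150):
        queue.put_nowait(f"doc-{i}")
    await adaptive_dispatch(queue)

# asyncio.run(main())
```

A Phase 4 system would go further and learn the threshold itself, but the core pattern is the same: routing decisions are made per item from live signals rather than fixed at design time.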
Core Components of Enterprise Automation Architecture
Workflow Orchestration Engine
The heart of any enterprise automation system is a robust orchestration engine that can coordinate complex, multi-step processes across various systems and departments.
```python
from dataclasses import dataclass
from typing import Dict, List, Any, Optional, Callable
from enum import Enum
import asyncio
import json
import logging
from datetime import datetime
import uuid

class TaskStatus(Enum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    RETRYING = "retrying"

class WorkflowStatus(Enum):
    CREATED = "created"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    PAUSED = "paused"

@dataclass
class Task:
    id: str
    name: str
    function: Callable
    parameters: Dict[str, Any]
    dependencies: List[str]
    retry_count: int = 3
    timeout: int = 300  # seconds
    status: TaskStatus = TaskStatus.PENDING
    result: Any = None
    error: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

@dataclass
class Workflow:
    id: str
    name: str
    description: str
    tasks: List[Task]
    status: WorkflowStatus = WorkflowStatus.CREATED
    created_at: Optional[datetime] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    metadata: Optional[Dict[str, Any]] = None
class WorkflowOrchestrator:
    def __init__(self):
        self.workflows: Dict[str, Workflow] = {}
        self.task_results: Dict[str, Any] = {}
        self.logger = logging.getLogger(__name__)

    async def create_workflow(self, name: str, description: str, tasks: List[Task]) -> str:
        """Create a new workflow with specified tasks"""
        workflow_id = str(uuid.uuid4())

        workflow = Workflow(
            id=workflow_id,
            name=name,
            description=description,
            tasks=tasks,
            created_at=datetime.utcnow(),
            metadata={}
        )

        self.workflows[workflow_id] = workflow
        self.logger.info(f"Created workflow {workflow_id}: {name}")

        return workflow_id
    async def execute_workflow(self, workflow_id: str) -> bool:
        """Execute a workflow with dependency management"""
        if workflow_id not in self.workflows:
            raise ValueError(f"Workflow {workflow_id} not found")

        workflow = self.workflows[workflow_id]
        workflow.status = WorkflowStatus.RUNNING
        workflow.started_at = datetime.utcnow()

        try:
            completed_tasks = set()
            failed_tasks = set()

            while len(completed_tasks) + len(failed_tasks) < len(workflow.tasks):
                # Find tasks whose dependencies have all completed
                ready_tasks = [
                    task for task in workflow.tasks
                    if (task.status == TaskStatus.PENDING
                        and all(dep in completed_tasks for dep in task.dependencies))
                ]

                if not ready_tasks:
                    # Remaining tasks are blocked by failed dependencies
                    remaining_tasks = [
                        t for t in workflow.tasks
                        if t.id not in completed_tasks and t.id not in failed_tasks
                    ]
                    if remaining_tasks:
                        self.logger.error(f"Workflow {workflow_id} stuck - no ready tasks")
                        workflow.status = WorkflowStatus.FAILED
                        return False
                    break

                # Execute ready tasks in parallel
                task_futures = []
                for task in ready_tasks:
                    future = asyncio.create_task(self._execute_task(task, workflow_id))
                    task_futures.append((task, future))

                # Wait for tasks to complete
                for task, future in task_futures:
                    try:
                        success = await future
                        if success:
                            completed_tasks.add(task.id)
                        else:
                            failed_tasks.add(task.id)

                            # Check if this failure should stop the workflow
                            if self._is_critical_task(task):
                                workflow.status = WorkflowStatus.FAILED
                                return False

                    except Exception as e:
                        self.logger.error(f"Task {task.id} failed with exception: {e}")
                        failed_tasks.add(task.id)

            # Non-critical failures still count as (partial) completion
            workflow.status = WorkflowStatus.COMPLETED
            workflow.completed_at = datetime.utcnow()
            return True

        except Exception as e:
            self.logger.error(f"Workflow {workflow_id} failed: {e}")
            workflow.status = WorkflowStatus.FAILED
            workflow.completed_at = datetime.utcnow()
            return False
    async def _execute_task(self, task: Task, workflow_id: str) -> bool:
        """Execute a single task with retry logic"""
        task.status = TaskStatus.RUNNING
        task.started_at = datetime.utcnow()

        for attempt in range(task.retry_count + 1):
            try:
                # Set timeout for task execution
                result = await asyncio.wait_for(
                    self._run_task_function(task, workflow_id),
                    timeout=task.timeout
                )

                task.result = result
                task.status = TaskStatus.COMPLETED
                task.completed_at = datetime.utcnow()

                # Store result for dependent tasks
                self.task_results[f"{workflow_id}:{task.id}"] = result

                self.logger.info(f"Task {task.id} completed successfully")
                return True

            except asyncio.TimeoutError:
                error_msg = f"Task {task.id} timed out after {task.timeout} seconds"
                self.logger.warning(error_msg)
                task.error = error_msg

            except Exception as e:
                error_msg = f"Task {task.id} failed: {str(e)}"
                self.logger.warning(error_msg)
                task.error = error_msg

            # Retry with exponential backoff
            if attempt < task.retry_count:
                task.status = TaskStatus.RETRYING
                await asyncio.sleep(2 ** attempt)
            else:
                task.status = TaskStatus.FAILED
                task.completed_at = datetime.utcnow()
                return False

        return False
    async def _run_task_function(self, task: Task, workflow_id: str) -> Any:
        """Run the actual task function"""
        # Prepare parameters with context
        context = {
            'workflow_id': workflow_id,
            'task_id': task.id,
            'task_results': self.task_results,
            'workflow': self.workflows[workflow_id]
        }

        # Merge task parameters with context
        execution_params = {**task.parameters, 'context': context}

        # Execute the function (supports both sync and async callables)
        if asyncio.iscoroutinefunction(task.function):
            return await task.function(**execution_params)
        else:
            return task.function(**execution_params)

    def _is_critical_task(self, task: Task) -> bool:
        """Determine if a task failure should stop the entire workflow"""
        # This could be made configurable per task
        return task.parameters.get('critical', False)

    def get_workflow_status(self, workflow_id: str) -> Dict[str, Any]:
        """Get detailed status of a workflow"""
        if workflow_id not in self.workflows:
            return {"error": "Workflow not found"}

        workflow = self.workflows[workflow_id]

        task_statuses = {}
        for task in workflow.tasks:
            task_statuses[task.id] = {
                'name': task.name,
                'status': task.status.value,
                'started_at': task.started_at.isoformat() if task.started_at else None,
                'completed_at': task.completed_at.isoformat() if task.completed_at else None,
                'error': task.error
            }

        return {
            'workflow_id': workflow_id,
            'name': workflow.name,
            'status': workflow.status.value,
            'created_at': workflow.created_at.isoformat(),
            'started_at': workflow.started_at.isoformat() if workflow.started_at else None,
            'completed_at': workflow.completed_at.isoformat() if workflow.completed_at else None,
            'tasks': task_statuses
        }
# Example task functions
async def extract_customer_data(source_system: str, date_range: str, context: Dict) -> Dict:
    """Extract customer data from source system"""
    print(f"Extracting customer data from {source_system} for {date_range}")

    # Simulate data extraction
    await asyncio.sleep(2)

    extracted_data = {
        'customers': [
            {'id': i, 'name': f'Customer {i}', 'email': f'customer{i}@example.com'}
            for i in range(1000)
        ],
        'extraction_timestamp': datetime.utcnow().isoformat(),
        'source_system': source_system
    }

    return extracted_data

async def transform_customer_data(transformation_rules: List[str], context: Dict) -> Dict:
    """Transform customer data according to business rules"""

    # Get data from the previous task; result keys have the form
    # "<workflow_id>:<task_id>", so match on the task id defined below
    extract_task_result = None
    for key, value in context['task_results'].items():
        if 'extract_task' in key:
            extract_task_result = value
            break

    if not extract_task_result:
        raise ValueError("No customer data found from extraction task")

    print(f"Transforming {len(extract_task_result['customers'])} customer records")

    # Simulate data transformation
    await asyncio.sleep(3)

    # Apply transformation rules
    transformed_customers = []
    for customer in extract_task_result['customers']:
        transformed_customer = customer.copy()

        if 'normalize_email' in transformation_rules:
            transformed_customer['email'] = customer['email'].lower()

        if 'add_customer_segment' in transformation_rules:
            transformed_customer['segment'] = 'premium' if customer['id'] % 3 == 0 else 'standard'

        transformed_customers.append(transformed_customer)

    return {
        'customers': transformed_customers,
        'transformation_timestamp': datetime.utcnow().isoformat(),
        'rules_applied': transformation_rules
    }
async def load_customer_data(destination_system: str, batch_size: int, context: Dict) -> Dict:
    """Load transformed customer data to destination system"""

    # Get data from the transformation task (keyed by task id)
    transform_task_result = None
    for key, value in context['task_results'].items():
        if 'transform_task' in key:
            transform_task_result = value
            break

    if not transform_task_result:
        raise ValueError("No transformed customer data found")

    customers = transform_task_result['customers']
    print(f"Loading {len(customers)} customer records to {destination_system}")

    # Simulate batch loading
    loaded_count = 0
    for i in range(0, len(customers), batch_size):
        batch = customers[i:i + batch_size]
        await asyncio.sleep(1)  # Simulate network call
        loaded_count += len(batch)
        print(f"Loaded batch {i // batch_size + 1}, total records: {loaded_count}")

    return {
        'loaded_count': loaded_count,
        'destination_system': destination_system,
        'load_timestamp': datetime.utcnow().isoformat()
    }

async def send_completion_notification(recipients: List[str], context: Dict) -> Dict:
    """Send workflow completion notification"""

    workflow = context['workflow']

    # The workflow's completed_at is only set after all tasks finish,
    # so fall back to the current time when reporting from inside it
    end_time = workflow.completed_at or datetime.utcnow()

    message = f"""
    Workflow '{workflow.name}' completed successfully!

    Workflow ID: {workflow.id}
    Started: {workflow.started_at}
    Completed: {end_time}
    Duration: {end_time - workflow.started_at}
    """

    print(f"Sending notification to {recipients}")
    print(message)

    # Simulate sending notification
    await asyncio.sleep(1)

    return {
        'notification_sent': True,
        'recipients': recipients,
        'timestamp': datetime.utcnow().isoformat()
    }
# Usage example
async def create_customer_etl_workflow():
    """Create and run a customer ETL workflow"""

    orchestrator = WorkflowOrchestrator()

    # Define tasks
    tasks = [
        Task(
            id="extract_task",
            name="Extract Customer Data",
            function=extract_customer_data,
            parameters={
                'source_system': 'CRM_Database',
                'date_range': '2024-01-01:2024-01-31'
            },
            dependencies=[],
            timeout=300
        ),
        Task(
            id="transform_task",
            name="Transform Customer Data",
            function=transform_customer_data,
            parameters={
                'transformation_rules': ['normalize_email', 'add_customer_segment']
            },
            dependencies=["extract_task"],
            timeout=600
        ),
        Task(
            id="load_task",
            name="Load Customer Data",
            function=load_customer_data,
            parameters={
                'destination_system': 'Data_Warehouse',
                'batch_size': 100
            },
            dependencies=["transform_task"],
            timeout=900
        ),
        Task(
            id="notify_task",
            name="Send Completion Notification",
            function=send_completion_notification,
            parameters={
                'recipients': ['data-team@company.com', 'manager@company.com']
            },
            dependencies=["load_task"],
            timeout=60
        )
    ]

    # Create workflow
    workflow_id = await orchestrator.create_workflow(
        name="Customer ETL Pipeline",
        description="Extract, transform, and load customer data from CRM to data warehouse",
        tasks=tasks
    )

    print(f"Created workflow: {workflow_id}")

    # Execute workflow
    success = await orchestrator.execute_workflow(workflow_id)

    # Get final status
    status = orchestrator.get_workflow_status(workflow_id)
    print(f"Workflow execution {'succeeded' if success else 'failed'}")
    print(f"Final status: {json.dumps(status, indent=2)}")
# Run the example
# asyncio.run(create_customer_etl_workflow())
```

Integration Framework
Modern enterprise automation requires seamless integration with existing systems, APIs, and data sources.
```python
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
from dataclasses import dataclass
import sqlite3
import logging

import requests

# Database drivers (psycopg2, mysql-connector) are imported lazily in connect()

@dataclass
class ConnectionConfig:
    connection_type: str
    host: str
    port: int
    username: str
    password: str
    database: Optional[str] = None
    additional_params: Optional[Dict[str, Any]] = None
class IntegrationConnector(ABC):
    """Abstract base class for all integration connectors"""

    def __init__(self, config: ConnectionConfig):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    async def connect(self) -> bool:
        """Establish connection to the external system"""
        pass

    @abstractmethod
    async def disconnect(self) -> bool:
        """Close connection to the external system"""
        pass

    @abstractmethod
    async def execute_operation(self, operation: str, parameters: Dict[str, Any]) -> Any:
        """Execute an operation on the external system"""
        pass

class DatabaseConnector(IntegrationConnector):
    """Connector for database systems"""

    def __init__(self, config: ConnectionConfig):
        super().__init__(config)
        self.connection = None

    async def connect(self) -> bool:
        """Connect to database"""
        try:
            if self.config.connection_type.lower() == 'postgresql':
                import psycopg2
                self.connection = psycopg2.connect(
                    host=self.config.host,
                    port=self.config.port,
                    database=self.config.database,
                    user=self.config.username,
                    password=self.config.password
                )
            elif self.config.connection_type.lower() == 'mysql':
                import mysql.connector
                self.connection = mysql.connector.connect(
                    host=self.config.host,
                    port=self.config.port,
                    database=self.config.database,
                    user=self.config.username,
                    password=self.config.password
                )
            elif self.config.connection_type.lower() == 'sqlite':
                self.connection = sqlite3.connect(self.config.database)
            else:
                raise ValueError(f"Unsupported database type: {self.config.connection_type}")

            self.logger.info(f"Connected to {self.config.connection_type} database")
            return True

        except Exception as e:
            self.logger.error(f"Failed to connect to database: {e}")
            return False

    async def disconnect(self) -> bool:
        """Disconnect from database"""
        try:
            if self.connection:
                self.connection.close()
                self.connection = None
            return True
        except Exception as e:
            self.logger.error(f"Error disconnecting from database: {e}")
            return False

    async def execute_operation(self, operation: str, parameters: Dict[str, Any]) -> Any:
        """Execute database operation"""
        if not self.connection:
            raise RuntimeError("Database connection not established")

        cursor = self.connection.cursor()

        try:
            if operation.lower() == 'select':
                query = parameters.get('query')
                cursor.execute(query, parameters.get('params', []))

                if parameters.get('fetch_all', True):
                    return cursor.fetchall()
                else:
                    return cursor.fetchone()

            elif operation.lower() in ['insert', 'update', 'delete']:
                query = parameters.get('query')
                cursor.execute(query, parameters.get('params', []))
                self.connection.commit()
                return cursor.rowcount

            elif operation.lower() == 'bulk_insert':
                query = parameters.get('query')
                data = parameters.get('data')
                cursor.executemany(query, data)
                self.connection.commit()
                return cursor.rowcount

            else:
                raise ValueError(f"Unsupported operation: {operation}")

        except Exception as e:
            self.connection.rollback()
            self.logger.error(f"Database operation failed: {e}")
            raise
        finally:
            cursor.close()
class APIConnector(IntegrationConnector):
    """Connector for REST API systems"""

    def __init__(self, config: ConnectionConfig):
        super().__init__(config)
        self.base_url = f"https://{config.host}:{config.port}"
        self.session = requests.Session()
        self.authenticated = False

    async def connect(self) -> bool:
        """Authenticate with API"""
        try:
            extra = self.config.additional_params or {}
            auth_endpoint = extra.get('auth_endpoint', '/auth')

            auth_data = {
                'username': self.config.username,
                'password': self.config.password
            }

            response = self.session.post(f"{self.base_url}{auth_endpoint}", json=auth_data)
            response.raise_for_status()

            # Store authentication token
            auth_response = response.json()
            token = auth_response.get('access_token') or auth_response.get('token')

            if token:
                self.session.headers.update({'Authorization': f'Bearer {token}'})
                self.authenticated = True
                self.logger.info("API authentication successful")
                return True
            else:
                self.logger.error("No token received from API")
                return False

        except Exception as e:
            self.logger.error(f"API authentication failed: {e}")
            return False

    async def disconnect(self) -> bool:
        """Close API session"""
        try:
            self.session.close()
            self.authenticated = False
            return True
        except Exception as e:
            self.logger.error(f"Error closing API session: {e}")
            return False

    async def execute_operation(self, operation: str, parameters: Dict[str, Any]) -> Any:
        """Execute API operation"""
        if not self.authenticated:
            raise RuntimeError("API connection not authenticated")

        try:
            method = parameters.get('method', 'GET').upper()
            endpoint = parameters.get('endpoint')
            data = parameters.get('data')
            params = parameters.get('params')
            headers = parameters.get('headers', {})

            url = f"{self.base_url}{endpoint}"

            # Merge custom headers with session headers
            request_headers = {**self.session.headers, **headers}

            if method == 'GET':
                response = self.session.get(url, params=params, headers=request_headers)
            elif method == 'POST':
                response = self.session.post(url, json=data, params=params, headers=request_headers)
            elif method == 'PUT':
                response = self.session.put(url, json=data, params=params, headers=request_headers)
            elif method == 'DELETE':
                response = self.session.delete(url, params=params, headers=request_headers)
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")

            response.raise_for_status()

            # Return JSON response if possible, otherwise return text
            try:
                return response.json()
            except ValueError:
                return response.text

        except Exception as e:
            self.logger.error(f"API operation failed: {e}")
            raise
class FileSystemConnector(IntegrationConnector):
    """Connector for file system operations"""

    def __init__(self, config: ConnectionConfig):
        super().__init__(config)
        self.base_path = config.host  # Use host field for base path

    async def connect(self) -> bool:
        """Verify file system access"""
        import os
        try:
            if os.path.exists(self.base_path) and os.access(self.base_path, os.R_OK | os.W_OK):
                self.logger.info(f"File system access verified: {self.base_path}")
                return True
            else:
                self.logger.error(f"No access to file system path: {self.base_path}")
                return False
        except Exception as e:
            self.logger.error(f"File system connection failed: {e}")
            return False

    async def disconnect(self) -> bool:
        """No special disconnect needed for file system"""
        return True

    async def execute_operation(self, operation: str, parameters: Dict[str, Any]) -> Any:
        """Execute file system operation"""
        import os
        import shutil
        import glob

        try:
            if operation.lower() == 'read_file':
                file_path = os.path.join(self.base_path, parameters.get('file_path'))
                with open(file_path, 'r', encoding=parameters.get('encoding', 'utf-8')) as f:
                    return f.read()

            elif operation.lower() == 'write_file':
                file_path = os.path.join(self.base_path, parameters.get('file_path'))
                content = parameters.get('content')

                # Create directory if it doesn't exist
                os.makedirs(os.path.dirname(file_path), exist_ok=True)

                with open(file_path, 'w', encoding=parameters.get('encoding', 'utf-8')) as f:
                    f.write(content)
                return f"File written: {file_path}"

            elif operation.lower() == 'list_files':
                pattern = parameters.get('pattern', '*')
                search_path = os.path.join(self.base_path, pattern)
                return glob.glob(search_path)

            elif operation.lower() == 'copy_file':
                source = os.path.join(self.base_path, parameters.get('source'))
                destination = os.path.join(self.base_path, parameters.get('destination'))
                shutil.copy2(source, destination)
                return f"File copied: {source} -> {destination}"

            elif operation.lower() == 'delete_file':
                file_path = os.path.join(self.base_path, parameters.get('file_path'))
                os.remove(file_path)
                return f"File deleted: {file_path}"

            else:
                raise ValueError(f"Unsupported file operation: {operation}")

        except Exception as e:
            self.logger.error(f"File system operation failed: {e}")
            raise
class IntegrationManager:
    """Manages multiple integration connectors"""

    def __init__(self):
        self.connectors: Dict[str, IntegrationConnector] = {}
        self.logger = logging.getLogger(__name__)

    def register_connector(self, name: str, connector: IntegrationConnector) -> bool:
        """Register a new connector"""
        self.connectors[name] = connector
        self.logger.info(f"Registered connector: {name}")
        return True

    async def connect_all(self) -> Dict[str, bool]:
        """Connect to all registered systems"""
        results = {}
        for name, connector in self.connectors.items():
            try:
                results[name] = await connector.connect()
            except Exception as e:
                self.logger.error(f"Failed to connect to {name}: {e}")
                results[name] = False
        return results

    async def disconnect_all(self) -> Dict[str, bool]:
        """Disconnect from all systems"""
        results = {}
        for name, connector in self.connectors.items():
            try:
                results[name] = await connector.disconnect()
            except Exception as e:
                self.logger.error(f"Failed to disconnect from {name}: {e}")
                results[name] = False
        return results

    async def execute_operation(self, connector_name: str, operation: str, parameters: Dict[str, Any]) -> Any:
        """Execute operation on specified connector"""
        if connector_name not in self.connectors:
            raise ValueError(f"Connector {connector_name} not found")

        connector = self.connectors[connector_name]
        return await connector.execute_operation(operation, parameters)
# Example usage of integration framework
async def setup_integration_example():
    """Example of setting up multiple integrations"""

    manager = IntegrationManager()

    # Database connector
    db_config = ConnectionConfig(
        connection_type='postgresql',
        host='localhost',
        port=5432,
        username='dbuser',
        password='dbpass',
        database='customer_db'
    )
    db_connector = DatabaseConnector(db_config)
    manager.register_connector('customer_db', db_connector)

    # API connector
    api_config = ConnectionConfig(
        connection_type='rest_api',
        host='api.company.com',
        port=443,
        username='api_user',
        password='api_key',
        additional_params={'auth_endpoint': '/oauth/token'}
    )
    api_connector = APIConnector(api_config)
    manager.register_connector('crm_api', api_connector)

    # File system connector
    fs_config = ConnectionConfig(
        connection_type='filesystem',
        host='/data/shared',
        port=0,
        username='',
        password=''
    )
    fs_connector = FileSystemConnector(fs_config)
    manager.register_connector('shared_storage', fs_connector)

    # Connect to all systems
    connection_results = await manager.connect_all()
    print("Connection results:", connection_results)

    # Example operations
    try:
        # Database operation
        customers = await manager.execute_operation(
            'customer_db', 'select',
            {
                'query': 'SELECT * FROM customers WHERE created_date > %s',
                'params': ['2024-01-01']
            }
        )
        print(f"Retrieved {len(customers)} customers from database")

        # API operation
        api_response = await manager.execute_operation(
            'crm_api', 'api_call',
            {
                'method': 'GET',
                'endpoint': '/customers',
                'params': {'limit': 100}
            }
        )
        print(f"API response: {api_response}")

        # File operation
        files = await manager.execute_operation(
            'shared_storage', 'list_files',
            {'pattern': '*.csv'}
        )
        print(f"Found {len(files)} CSV files")

    except Exception as e:
        print(f"Operation failed: {e}")

    # Disconnect from all systems
    disconnect_results = await manager.disconnect_all()
    print("Disconnect results:", disconnect_results)
# Run the integration example
# asyncio.run(setup_integration_example())
```

Building systems like these requires careful attention to everything from basic task execution to error handling and recovery. Just as important, effective automation must balance machine execution with human oversight, especially for critical business processes.
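One practical way to keep that human oversight inside the orchestrator above is to model an approval as a blocking task. The sketch below is an illustrative assumption rather than part of the framework: the in-memory `PENDING_APPROVALS` store and the polling interval stand in for whatever ticketing system or chat-ops integration a real deployment would use.

```python
import asyncio
from datetime import datetime
from typing import Any, Dict

# Hypothetical approval store; a real deployment would back this with a
# ticketing system, a chat-ops bot, or an approvals table.
PENDING_APPROVALS: Dict[str, bool] = {}

async def human_approval_gate(approver: str, poll_seconds: int,
                              context: Dict, **task_flags: Any) -> Dict:
    """Block the workflow until a human approves; the Task timeout bounds the wait."""
    # **task_flags absorbs orchestration flags such as 'critical', which the
    # orchestrator merges into the call alongside the other parameters
    request_id = f"{context['workflow_id']}:{context['task_id']}"
    PENDING_APPROVALS.setdefault(request_id, False)
    print(f"Approval requested from {approver} for {request_id}")

    # Poll until a reviewer flips the flag (e.g. via an admin endpoint)
    while not PENDING_APPROVALS[request_id]:
        await asyncio.sleep(poll_seconds)

    return {'approved_by': approver, 'approved_at': datetime.utcnow().isoformat()}

# Wired in like any other task, and marked critical so an unanswered
# request (which eventually times out) stops the workflow:
# Task(id="approval_task", name="Manager Approval",
#      function=human_approval_gate,
#      parameters={'approver': 'manager@company.com',
#                  'poll_seconds': 30, 'critical': True},
#      dependencies=["load_task"], timeout=3600)
```

Because the gate is an ordinary `Task`, the existing timeout and critical-task handling double as an escalation policy: an unanswered request fails the task and, being critical, halts the workflow.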
Intelligent Document Processing
One of the most transformative applications of enterprise automation is intelligent document processing, which combines optical character recognition (OCR), natural language processing (NLP), and machine learning to automate document-intensive workflows.
Advanced OCR and Document Analysis
```python
import cv2
import numpy as np
import pytesseract
import re
import json
import asyncio
import spacy
from transformers import pipeline
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from datetime import datetime

@dataclass
class DocumentRegion:
    x: int
    y: int
    width: int
    height: int
    text: str
    confidence: float
    region_type: str  # 'header', 'paragraph', 'table', 'signature', etc.

@dataclass
class ExtractedField:
    field_name: str
    value: str
    confidence: float
    coordinates: Tuple[int, int, int, int]
    validation_status: str  # 'valid', 'invalid', 'requires_review'

@dataclass
class DocumentAnalysisResult:
    document_id: str
    document_type: str
    processing_timestamp: datetime
    extracted_text: str
    structured_data: Dict[str, Any]
    extracted_fields: List[ExtractedField]
    regions: List[DocumentRegion]
    quality_score: float
    processing_metadata: Dict[str, Any]
class IntelligentDocumentProcessor:
    def __init__(self):
        # Load NLP models
        self.nlp = spacy.load("en_core_web_sm")
        self.ner_pipeline = pipeline("ner", aggregation_strategy="simple")

        # Document type classification is rule-based (see _classify_document);
        # a fine-tuned text-classification model could replace it

        # Field extraction patterns
        self.field_patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
            'date': r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b',
            'currency': r'\$\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?',
            'invoice_number': r'(?:invoice|inv)[#\s]*(\w+\d+)',
            'po_number': r'(?:po|purchase order)[#\s]*(\w+\d+)'
        }

        # Document type templates
        self.document_templates = {
            'invoice': {
                'required_fields': ['invoice_number', 'date', 'amount', 'vendor'],
                'optional_fields': ['po_number', 'tax_amount', 'due_date'],
                'validation_rules': {
                    'amount': lambda x: self._validate_currency(x),
                    'date': lambda x: self._validate_date(x)
                }
            },
            'contract': {
                'required_fields': ['parties', 'effective_date', 'term'],
                'optional_fields': ['termination_clause', 'governing_law'],
                'validation_rules': {
                    'effective_date': lambda x: self._validate_date(x)
                }
            },
            'receipt': {
                'required_fields': ['merchant', 'date', 'total_amount'],
                'optional_fields': ['tax_amount', 'items'],
                'validation_rules': {
                    'total_amount': lambda x: self._validate_currency(x)
                }
            }
        }
    async def process_document(self, image_path: str, document_id: str = None) -> DocumentAnalysisResult:
        """Process a document image and extract structured data"""

        if document_id is None:
            document_id = f"doc_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        start_time = datetime.now()

        # Step 1: Image preprocessing
        processed_image = self._preprocess_image(image_path)

        # Step 2: OCR extraction
        extracted_text, regions = self._extract_text_with_regions(processed_image)

        # Step 3: Document classification
        document_type = self._classify_document(extracted_text)

        # Step 4: Field extraction
        extracted_fields = self._extract_fields(extracted_text, document_type, regions)

        # Step 5: Structure extraction
        structured_data = self._extract_structured_data(extracted_text, document_type)

        # Step 6: Validation
        validated_fields = self._validate_extracted_fields(extracted_fields, document_type)

        # Step 7: Quality assessment
        quality_score = self._assess_quality(extracted_text, validated_fields, regions)

        processing_time = (datetime.now() - start_time).total_seconds()

        return DocumentAnalysisResult(
            document_id=document_id,
            document_type=document_type,
            processing_timestamp=datetime.now(),
            extracted_text=extracted_text,
            structured_data=structured_data,
            extracted_fields=validated_fields,
            regions=regions,
            quality_score=quality_score,
            processing_metadata={
                'processing_time_seconds': processing_time,
                'image_path': image_path,
                'total_regions': len(regions),
                'total_fields_extracted': len(validated_fields)
            }
        )
    def _preprocess_image(self, image_path: str) -> np.ndarray:
        """Preprocess image for better OCR results"""

        # Load image
        image = cv2.imread(image_path)

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise reduction
        denoised = cv2.fastNlMeansDenoising(gray)

        # Enhance contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Binarization
        _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Deskewing
        coords = np.column_stack(np.where(binary > 0))
        angle = cv2.minAreaRect(coords)[-1]
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = binary.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(binary, M, (w, h), flags=cv2.INTER_CUBIC,
                                 borderMode=cv2.BORDER_REPLICATE)

        return rotated
    def _extract_text_with_regions(self, image: np.ndarray) -> Tuple[str, List[DocumentRegion]]:
        """Extract text and identify regions using OCR"""

        # Use pytesseract with detailed output
        custom_config = r'--oem 3 --psm 6'

        # Get detailed OCR data
        ocr_data = pytesseract.image_to_data(image, config=custom_config,
                                             output_type=pytesseract.Output.DICT)

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=custom_config)

        # Process regions
        regions = []
        n_boxes = len(ocr_data['level'])

        for i in range(n_boxes):
            conf_val = float(ocr_data['conf'][i])
            if conf_val > 30:  # Confidence threshold
                x = ocr_data['left'][i]
                y = ocr_data['top'][i]
                w = ocr_data['width'][i]
                h = ocr_data['height'][i]
                text = ocr_data['text'][i].strip()
                confidence = conf_val / 100.0

                if text:  # Only include non-empty text
                    region_type = self._classify_region(text, x, y, w, h, image.shape)

                    regions.append(DocumentRegion(
                        x=x, y=y, width=w, height=h,
                        text=text, confidence=confidence,
                        region_type=region_type
                    ))

        return full_text, regions
    def _classify_document(self, text: str) -> str:
        """Classify document type based on content"""

        text_lower = text.lower()

        # Simple rule-based classification
        if any(keyword in text_lower for keyword in ['invoice', 'bill to', 'amount due', 'payment terms']):
            return 'invoice'
        elif any(keyword in text_lower for keyword in ['agreement', 'contract', 'party', 'whereas']):
            return 'contract'
        elif any(keyword in text_lower for keyword in ['receipt', 'thank you', 'total', 'cashier']):
            return 'receipt'
        elif any(keyword in text_lower for keyword in ['statement', 'account', 'balance', 'transaction']):
            return 'statement'
        elif any(keyword in text_lower for keyword in ['resume', 'curriculum vitae', 'experience', 'education']):
            return 'resume'
        else:
            return 'unknown'

    def _classify_region(self, text: str, x: int, y: int, w: int, h: int, image_shape: Tuple) -> str:
        """Classify the type of text region"""

        image_height, image_width = image_shape[:2]

        # Position-based classification
        if y < image_height * 0.15:  # Top 15% of image
            return 'header'
        elif y > image_height * 0.85:  # Bottom 15% of image
            return 'footer'
        elif h > image_height * 0.1:  # Large height regions
            return 'paragraph'
        elif re.match(r'^\d+[.,]\d+$', text.strip()):  # Numbers
            return 'amount'
        elif re.match(r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$', text.strip()):  # Dates
            return 'date'
        elif len(text.split()) == 1 and text.isupper():  # Single uppercase words
            return 'label'
        else:
            return 'text'

    def _extract_fields(self, text: str, document_type: str, regions: List[DocumentRegion]) -> List[ExtractedField]:
        """Extract specific fields based on document type"""

        extracted_fields = []

        # Use regex patterns for common fields
        for field_name, pattern in self.field_patterns.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                # Find corresponding region
                matching_region = self._find_matching_region(match.group(), regions)

                if matching_region:
                    extracted_fields.append(ExtractedField(
                        field_name=field_name,
                        value=match.group(),
                        confidence=matching_region.confidence,
                        coordinates=(matching_region.x, matching_region.y,
                                     matching_region.width, matching_region.height),
                        validation_status='pending'
                    ))

        # Document-specific field extraction via named entity recognition
        if document_type in self.document_templates:
            doc = self.nlp(text)
            for ent in doc.ents:
                field_name = self._map_entity_to_field(ent.label_, document_type)
                if field_name:
                    matching_region = self._find_matching_region(ent.text, regions)

                    if matching_region:
                        extracted_fields.append(ExtractedField(
                            field_name=field_name,
                            value=ent.text,
                            confidence=matching_region.confidence,
                            coordinates=(matching_region.x, matching_region.y,
                                         matching_region.width, matching_region.height),
                            validation_status='pending'
                        ))

        return extracted_fields

    def _find_matching_region(self, text: str, regions: List[DocumentRegion]) -> Optional[DocumentRegion]:
        """Find the region that contains the specified text"""
        for region in regions:
            if text.lower() in region.text.lower():
                return region
        return None

    def _map_entity_to_field(self, entity_label: str, document_type: str) -> Optional[str]:
        """Map NLP entity labels to document fields"""
        mapping = {
            'PERSON': 'person_name',
            'ORG': 'organization',
            'DATE': 'date',
            'MONEY': 'amount',
            'GPE': 'location',
            'CARDINAL': 'number'
        }

        base_field = mapping.get(entity_label)

        # Document-specific mapping
        if document_type == 'invoice' and base_field == 'organization':
            return 'vendor'
        elif document_type == 'contract' and base_field == 'person_name':
            return 'party'

        return base_field
    def _extract_structured_data(self, text: str, document_type: str) -> Dict[str, Any]:
        """Extract structured data specific to document type"""

        structured_data = {
            'document_type': document_type,
            'extraction_timestamp': datetime.now().isoformat()
        }

        if document_type == 'invoice':
            structured_data.update(self._extract_invoice_data(text))
        elif document_type == 'contract':
            structured_data.update(self._extract_contract_data(text))
        elif document_type == 'receipt':
            structured_data.update(self._extract_receipt_data(text))

        return structured_data

    def _extract_invoice_data(self, text: str) -> Dict[str, Any]:
        """Extract invoice-specific structured data"""

        # Extract line items
        line_items = []
        lines = text.split('\n')

        for line in lines:
            # Look for patterns like "Description Qty Price Amount"
            if re.search(r'\d+\.\d{2}', line) and len(line.split()) >= 3:
                parts = line.split()
                if len(parts) >= 4:
                    try:
                        qty = float(parts[-3])
                        price = float(parts[-2].replace('$', '').replace(',', ''))
                        amount = float(parts[-1].replace('$', '').replace(',', ''))
                        description = ' '.join(parts[:-3])

                        line_items.append({
                            'description': description,
                            'quantity': qty,
                            'unit_price': price,
                            'total_amount': amount
                        })
                    except ValueError:
                        continue

        # Extract totals
        subtotal = self._extract_amount_by_label(text, ['subtotal', 'sub total'])
        tax_amount = self._extract_amount_by_label(text, ['tax', 'sales tax', 'vat'])
        total_amount = self._extract_amount_by_label(text, ['total', 'amount due', 'balance due'])

        return {
            'line_items': line_items,
            'subtotal': subtotal,
            'tax_amount': tax_amount,
            'total_amount': total_amount,
            'item_count': len(line_items)
        }

    def _extract_contract_data(self, text: str) -> Dict[str, Any]:
        """Extract contract-specific structured data"""

        # Extract parties
        parties = []
        doc = self.nlp(text)

        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'ORG'] and 'party' in text[max(0, ent.start_char - 50):ent.end_char + 50].lower():
                parties.append(ent.text)

        # Extract key terms
        effective_date = self._extract_date_by_label(text, ['effective', 'start', 'commencement'])
        expiration_date = self._extract_date_by_label(text, ['expiration', 'end', 'termination'])

        return {
            'parties': list(set(parties)),
            'effective_date': effective_date,
            'expiration_date': expiration_date,
            'party_count': len(set(parties))
        }

    def _extract_receipt_data(self, text: str) -> Dict[str, Any]:
        """Extract receipt-specific structured data"""

        # Extract items
        items = []
        lines = text.split('\n')

        for line in lines:
            if '$' in line and not any(keyword in line.lower() for keyword in ['tax', 'total', 'subtotal']):
                # Simple item extraction
                amount_match = re.search(r'\$\d+\.\d{2}', line)
                if amount_match:
                    description = line.replace(amount_match.group(), '').strip()
                    amount = float(amount_match.group().replace('$', ''))

                    items.append({
                        'description': description,
                        'amount': amount
                    })

        # Extract merchant info
        merchant = self._extract_merchant_info(text)

        return {
            'items': items,
            'merchant': merchant,
            'item_count': len(items)
        }
    def _extract_amount_by_label(self, text: str, labels: List[str]) -> Optional[float]:
        """Extract monetary amount near specified labels"""
        for label in labels:
            pattern = re.escape(label) + r'[:\s]*\$?(\d+(?:,\d{3})*(?:\.\d{2})?)'
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return float(match.group(1).replace(',', ''))
        return None

    def _extract_date_by_label(self, text: str, labels: List[str]) -> Optional[str]:
        """Extract date near specified labels"""
        for label in labels:
            # Look for a date within 100 characters after the label
            label_pos = text.lower().find(label.lower())
            if label_pos != -1:
                search_text = text[label_pos:label_pos + 100]
                date_match = re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', search_text)
                if date_match:
                    return date_match.group()
        return None

    def _extract_merchant_info(self, text: str) -> Optional[str]:
        """Extract merchant information from receipt"""
        lines = text.split('\n')

        # Merchant info is usually at the top of the receipt
        for line in lines[:5]:
            if line.strip() and not any(char.isdigit() for char in line):
                # Skip lines that are likely headers or labels
                if not any(keyword in line.lower() for keyword in ['receipt', 'store', 'location']):
                    return line.strip()

        return None
    def _validate_extracted_fields(self, fields: List[ExtractedField], document_type: str) -> List[ExtractedField]:
        """Validate extracted fields against business rules"""

        validated_fields = []

        for field in fields:
            # Apply validation rules
            if document_type in self.document_templates:
                template = self.document_templates[document_type]
                validation_rules = template.get('validation_rules', {})

                if field.field_name in validation_rules:
                    validation_func = validation_rules[field.field_name]
                    is_valid = validation_func(field.value)
                    field.validation_status = 'valid' if is_valid else 'invalid'
                else:
                    field.validation_status = 'valid'  # No specific rule
            else:
                field.validation_status = 'requires_review'  # Unknown document type

            validated_fields.append(field)

        return validated_fields

    def _validate_currency(self, value: str) -> bool:
        """Validate currency format"""
        try:
            amount = float(re.sub(r'[^\d.]', '', value))
            return amount >= 0
        except ValueError:
            return False

    def _validate_date(self, value: str) -> bool:
        """Validate date format"""
        date_patterns = [
            r'\d{1,2}[/-]\d{1,2}[/-]\d{4}',
            r'\d{4}[/-]\d{1,2}[/-]\d{1,2}',
            r'\d{1,2}[/-]\d{1,2}[/-]\d{2}'
        ]

        return any(re.match(pattern, value) for pattern in date_patterns)
    def _assess_quality(self, text: str, fields: List[ExtractedField], regions: List[DocumentRegion]) -> float:
        """Assess the quality of document processing"""

        # Text volume (normalized to 0-1)
        text_length_score = min(len(text) / 1000, 1.0)

        # Field extraction completeness
        valid_fields = [f for f in fields if f.validation_status == 'valid']
        field_score = len(valid_fields) / max(len(fields), 1)

        # Average OCR confidence
        if regions:
            avg_confidence = sum(r.confidence for r in regions) / len(regions)
        else:
            avg_confidence = 0.0

        # Weighted combination of the three factors
        quality_score = (text_length_score * 0.3 +
                         field_score * 0.4 +
                         avg_confidence * 0.3)

        return min(quality_score, 1.0)
# Usage example
async def process_invoice_example():
    """Example of processing an invoice document"""

    processor = IntelligentDocumentProcessor()

    # Process document
    result = await processor.process_document('invoice_sample.png', 'invoice_001')

    print(f"Document Type: {result.document_type}")
    print(f"Quality Score: {result.quality_score:.2f}")
    print(f"Processing Time: {result.processing_metadata['processing_time_seconds']:.2f}s")

    print("\nExtracted Fields:")
    for field in result.extracted_fields:
        print(f"  {field.field_name}: {field.value} "
              f"(confidence: {field.confidence:.2f}, status: {field.validation_status})")

    print("\nStructured Data:")
    print(json.dumps(result.structured_data, indent=2, default=str))

    return result
# Run the example
# result = asyncio.run(process_invoice_example())
```

This approach to intelligent document processing demonstrates how modern automation systems can handle complex, unstructured data and extract meaningful information with high accuracy. Combining OCR, NLP, and machine learning creates a powerful foundation for automating document-intensive business processes.
The key to successful implementation lies in understanding that automation is not just about replacing human tasks, but about augmenting human capabilities and creating more efficient, accurate, and scalable business processes. By combining technical sophistication with practical business requirements, organizations can build automation systems that deliver genuine value and competitive advantage.