xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision b2c7de82305294ddf7dd438eecf63f281ef33eba)
1 /*
2  * Copyright 2006-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/khash.h>
35 #include <util/list.h>
36 
37 #include "EndpointManager.h"
38 
39 
40 // References:
41 //  - RFC 793 - Transmission Control Protocol
42 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
43 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44 //
45 // Things this implementation currently doesn't implement:
46 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47 //	  RFC 2001, RFC 2581, RFC 3042
48 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
49 //	- Explicit Congestion Notification (ECN), RFC 3168
50 //	- SYN-Cache
51 //	- TCP Extensions for High Performance, RFC 1323
52 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
53 //	- Forward RTO-Recovery, RFC 4138
54 //	- Time-Wait hash instead of keeping sockets alive
55 
56 #define PrintAddress(address) \
57 	AddressString(Domain(), address, true).Data()
58 
59 //#define TRACE_TCP
60 //#define PROBE_TCP
61 
62 #ifdef TRACE_TCP
63 // the space before ', ##args' is important in order for this to work with cpp 2.95
64 #	define TRACE(format, args...)	dprintf("%ld: TCP [%llu] %p (%12s) " \
65 		format "\n", find_thread(NULL), system_time(), this, \
66 		name_for_state(fState) , ##args)
67 #else
68 #	define TRACE(args...)			do { } while (0)
69 #endif
70 
71 #ifdef PROBE_TCP
72 #	define PROBE(buffer, window) \
73 	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
74 		system_time(), PrintAddress(buffer->source), \
75 		PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \
76 		(uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \
77 		window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \
78 		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
79 #else
80 #	define PROBE(buffer, window)	do { } while (0)
81 #endif
82 
83 #if TCP_TRACING
84 namespace TCPTracing {
85 
86 class Receive : public AbstractTraceEntry {
87 public:
88 	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
89 			net_buffer* buffer)
90 		:
91 		fEndpoint(endpoint),
92 		fBuffer(buffer),
93 		fBufferSize(buffer->size),
94 		fSequence(segment.sequence),
95 		fAcknowledge(segment.acknowledge),
96 		fWindow(window),
97 		fState(endpoint->State()),
98 		fFlags(segment.flags)
99 	{
100 		Initialized();
101 	}
102 
103 	virtual void AddDump(TraceOutput& out)
104 	{
105 		out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, "
106 			"seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState),
107 			fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
108 	}
109 
110 protected:
111 	TCPEndpoint*	fEndpoint;
112 	net_buffer*		fBuffer;
113 	uint32			fBufferSize;
114 	uint32			fSequence;
115 	uint32			fAcknowledge;
116 	uint32			fWindow;
117 	tcp_state		fState;
118 	uint8			fFlags;
119 };
120 
121 class Send : public AbstractTraceEntry {
122 public:
123 	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
124 			uint32 firstSequence, uint32 lastSequence)
125 		:
126 		fEndpoint(endpoint),
127 		fBuffer(buffer),
128 		fBufferSize(buffer->size),
129 		fSequence(segment.sequence),
130 		fAcknowledge(segment.acknowledge),
131 		fFirstSequence(firstSequence),
132 		fLastSequence(lastSequence),
133 		fState(endpoint->State()),
134 		fFlags(segment.flags)
135 	{
136 		Initialized();
137 	}
138 
139 	virtual void AddDump(TraceOutput& out)
140 	{
141 		out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
142 			"seq %lu, ack %lu, first %lu, last %lu",
143 			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
144 			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
145 	}
146 
147 protected:
148 	TCPEndpoint*	fEndpoint;
149 	net_buffer*		fBuffer;
150 	uint32			fBufferSize;
151 	uint32			fSequence;
152 	uint32			fAcknowledge;
153 	uint32			fFirstSequence;
154 	uint32			fLastSequence;
155 	tcp_state		fState;
156 	uint8			fFlags;
157 };
158 
159 class State : public AbstractTraceEntry {
160 public:
161 	State(TCPEndpoint* endpoint)
162 		:
163 		fEndpoint(endpoint),
164 		fState(endpoint->State())
165 	{
166 		Initialized();
167 	}
168 
169 	virtual void AddDump(TraceOutput& out)
170 	{
171 		out.Print("tcp:%p (%12s) state change", fEndpoint,
172 			name_for_state(fState));
173 	}
174 
175 protected:
176 	TCPEndpoint*	fEndpoint;
177 	tcp_state		fState;
178 };
179 
180 class Timer : public AbstractTraceEntry {
181 public:
182 	Timer(TCPEndpoint* endpoint, const char* which)
183 		:
184 		fEndpoint(endpoint),
185 		fWhich(which),
186 		fState(endpoint->State())
187 	{
188 		Initialized();
189 	}
190 
191 	virtual void AddDump(TraceOutput& out)
192 	{
193 		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
194 			name_for_state(fState), fWhich);
195 	}
196 
197 protected:
198 	TCPEndpoint*	fEndpoint;
199 	const char*		fWhich;
200 	tcp_state		fState;
201 };
202 
203 }	// namespace TCPTracing
204 
205 #	define T(x)	new(std::nothrow) TCPTracing::x
206 #else
207 #	define T(x)
208 #endif	// TCP_TRACING
209 
210 // Initial estimate for packet round trip time (RTT)
211 #define TCP_INITIAL_RTT		2000000
212 
// Bit values that are combined in the fFlags field
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
	FLAG_CLOSED					= 1 << 3,
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
};
224 
225 
226 static const int kTimestampFactor = 1024;
227 
228 
229 static inline bigtime_t
230 absolute_timeout(bigtime_t timeout)
231 {
232 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
233 		return timeout;
234 
235 	return timeout + system_time();
236 }
237 
238 
239 static inline status_t
240 posix_error(status_t error)
241 {
242 	if (error == B_TIMED_OUT)
243 		return B_WOULD_BLOCK;
244 
245 	return error;
246 }
247 
248 
249 static inline bool
250 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
251 	uint32 receiveWindow)
252 {
253 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
254 }
255 
256 
257 static inline bool
258 segment_in_sequence(const tcp_segment_header& segment, int size,
259 	const tcp_sequence& receiveNext, uint32 receiveWindow)
260 {
261 	tcp_sequence sequence(segment.sequence);
262 	if (size == 0) {
263 		if (receiveWindow == 0)
264 			return sequence == receiveNext;
265 		return in_window(sequence, receiveNext, receiveWindow);
266 	} else {
267 		if (receiveWindow == 0)
268 			return false;
269 		return in_window(sequence, receiveNext, receiveWindow)
270 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
271 	}
272 }
273 
274 
275 static inline bool
276 is_writable(tcp_state state)
277 {
278 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
279 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
280 }
281 
282 
283 static inline uint32 tcp_now()
284 {
285 	return system_time() / kTimestampFactor;
286 }
287 
288 
289 static inline uint32 tcp_diff_timestamp(uint32 base)
290 {
291 	uint32 now = tcp_now();
292 
293 	if (now > base)
294 		return now - base;
295 
296 	return now + UINT_MAX - base;
297 }
298 
299 
300 static inline bool
301 state_needs_finish(int32 state)
302 {
303 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
304 		|| state == FINISH_SENT || state == CLOSING;
305 }
306 
307 
308 //	#pragma mark -
309 
310 
311 WaitList::WaitList(const char* name)
312 {
313 	fCondition = 0;
314 	fSem = create_sem(0, name);
315 }
316 
317 
318 WaitList::~WaitList()
319 {
320 	delete_sem(fSem);
321 }
322 
323 
324 status_t
325 WaitList::InitCheck() const
326 {
327 	return fSem;
328 }
329 
330 
331 status_t
332 WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
333 {
334 	locker.Unlock();
335 
336 	status_t status = B_OK;
337 
338 	while (!atomic_test_and_set(&fCondition, 0, 1)) {
339 		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
340 			timeout);
341 		if (status != B_OK)
342 			break;
343 	}
344 
345 	locker.Lock();
346 	return status;
347 }
348 
349 
350 void
351 WaitList::Signal()
352 {
353 	atomic_or(&fCondition, 1);
354 #ifdef __HAIKU__
355 	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
356 #else
357 	int32 count;
358 	if (get_sem_count(fSem, &count) == B_OK && count < 0)
359 		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
360 #endif
361 }
362 
363 
364 //	#pragma mark -
365 
366 
367 TCPEndpoint::TCPEndpoint(net_socket* socket)
368 	: ProtocolSocket(socket),
369 	fManager(NULL),
370 	fReceiveList("tcp receive"),
371 	fSendList("tcp send"),
372 	fOptions(0),
373 	fSendWindowShift(0),
374 	fReceiveWindowShift(0),
375 	fSendUnacknowledged(0),
376 	fSendNext(0),
377 	fSendMax(0),
378 	fSendUrgentOffset(0),
379 	fSendWindow(0),
380 	fSendMaxWindow(0),
381 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
382 	fSendQueue(socket->send.buffer_size),
383 	fInitialSendSequence(0),
384 	fDuplicateAcknowledgeCount(0),
385 	fRoute(NULL),
386 	fReceiveNext(0),
387 	fReceiveMaxAdvertised(0),
388 	fReceiveWindow(socket->receive.buffer_size),
389 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
390 	fReceiveQueue(socket->receive.buffer_size),
391 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
392 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
393 	fRetransmitTimeout(TCP_INITIAL_RTT),
394 	fReceivedTimestamp(0),
395 	fCongestionWindow(0),
396 	fSlowStartThreshold(0),
397 	fState(CLOSED),
398 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
399 {
400 	// TODO: to be replaced with a real read/write locking strategy!
401 	mutex_init(&fLock, "tcp lock");
402 
403 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
404 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
405 		this);
406 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
407 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
408 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this);
409 }
410 
411 
412 TCPEndpoint::~TCPEndpoint()
413 {
414 	mutex_lock(&fLock);
415 
416 	_CancelConnectionTimers();
417 	gStackModule->cancel_timer(&fTimeWaitTimer);
418 
419 	if (fManager != NULL) {
420 		fManager->Unbind(this);
421 		put_endpoint_manager(fManager);
422 	}
423 
424 	mutex_destroy(&fLock);
425 
426 	// TODO: we need to wait for all timers to return
427 	//_WaitForTimers();
428 }
429 
430 
431 status_t
432 TCPEndpoint::InitCheck() const
433 {
434 	if (fReceiveList.InitCheck() < B_OK)
435 		return fReceiveList.InitCheck();
436 
437 	if (fSendList.InitCheck() < B_OK)
438 		return fSendList.InitCheck();
439 
440 	return B_OK;
441 }
442 
443 
444 //	#pragma mark - protocol API
445 
446 
447 status_t
448 TCPEndpoint::Open()
449 {
450 	TRACE("Open()");
451 
452 	status_t status = ProtocolSocket::Open();
453 	if (status < B_OK)
454 		return status;
455 
456 	fManager = get_endpoint_manager(Domain());
457 	if (fManager == NULL)
458 		return EAFNOSUPPORT;
459 
460 	return B_OK;
461 }
462 
463 
464 status_t
465 TCPEndpoint::Close()
466 {
467 	TRACE("Close()");
468 
469 	MutexLocker lock(fLock);
470 
471 	if (fState == LISTEN)
472 		delete_sem(fAcceptSemaphore);
473 
474 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
475 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
476 		fState = CLOSED;
477 		T(State(this));
478 		return B_OK;
479 	}
480 
481 	status_t status = _Disconnect(true);
482 	if (status != B_OK)
483 		return status;
484 
485 	if (socket->options & SO_LINGER) {
486 		TRACE("Close(): Lingering for %i secs", socket->linger);
487 
488 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
489 
490 		while (fSendQueue.Used() > 0) {
491 			status = fSendList.Wait(lock, maximum);
492 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
493 				break;
494 			else if (status < B_OK)
495 				return status;
496 		}
497 
498 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
499 			fSendQueue.Used());
500 	}
501 
502 	return B_OK;
503 }
504 
505 
506 status_t
507 TCPEndpoint::Free()
508 {
509 	TRACE("Free()");
510 
511 	MutexLocker _(fLock);
512 
513 	if (fState <= SYNCHRONIZE_SENT)
514 		return B_OK;
515 
516 	// we are only interested in the timer, not in changing state
517 	_EnterTimeWait();
518 
519 	fFlags |= FLAG_CLOSED;
520 	if ((fFlags & FLAG_DELETE_ON_CLOSE) != 0)
521 		return B_OK;
522 
523 	return B_BUSY;
524 		// we'll be freed later when the 2MSL timer expires
525 }
526 
527 
528 /*!	Creates and sends a synchronize packet to /a address, and then waits
529 	until the connection has been established or refused.
530 */
531 status_t
532 TCPEndpoint::Connect(const sockaddr* address)
533 {
534 	TRACE("Connect() on address %s", PrintAddress(address));
535 
536 	MutexLocker locker(fLock);
537 
538 	if (gStackModule->is_restarted_syscall()) {
539 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
540 		status_t status = _WaitForEstablished(locker, timeout);
541 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
542 			strerror(status), timeout);
543 		return posix_error(status);
544 	}
545 
546 	// Can only call connect() from CLOSED or LISTEN states
547 	// otherwise endpoint is considered already connected
548 	if (fState == LISTEN) {
549 		// this socket is about to connect; remove pending connections in the backlog
550 		gSocketModule->set_max_backlog(socket, 0);
551 	} else if (fState == ESTABLISHED) {
552 		return EISCONN;
553 	} else if (fState != CLOSED)
554 		return EINPROGRESS;
555 
556 	// TODO: this is IPv4 specific, and doesn't belong here!
557 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
558 	sockaddr_in _address;
559 	if (((sockaddr_in*)address)->sin_addr.s_addr == INADDR_ANY) {
560 		memcpy(&_address, address, sizeof(sockaddr_in));
561 		_address.sin_len = sizeof(sockaddr_in);
562 		_address.sin_addr.s_addr = INADDR_LOOPBACK;
563 		address = (sockaddr*)&_address;
564 	}
565 
566 	status_t status = _PrepareSendPath(address);
567 	if (status < B_OK)
568 		return status;
569 
570 	TRACE("  Connect(): starting 3-way handshake...");
571 
572 	fState = SYNCHRONIZE_SENT;
573 	T(State(this));
574 
575 	// send SYN
576 	status = _SendQueued();
577 	if (status != B_OK) {
578 		_Close();
579 		return status;
580 	}
581 
582 	// If we are running over Loopback, after _SendQueued() returns we
583 	// may be in ESTABLISHED already.
584 	if (fState == ESTABLISHED) {
585 		TRACE("  Connect() completed after _SendQueued()");
586 		return B_OK;
587 	}
588 
589 	// wait until 3-way handshake is complete (if needed)
590 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
591 	if (timeout == 0) {
592 		// we're a non-blocking socket
593 		TRACE("  Connect() delayed, return EINPROGRESS");
594 		return EINPROGRESS;
595 	}
596 
597 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
598 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
599 
600 	status = _WaitForEstablished(locker, absoluteTimeout);
601 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
602 		strerror(status), timeout);
603 	return posix_error(status);
604 }
605 
606 
607 status_t
608 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
609 {
610 	TRACE("Accept()");
611 
612 	MutexLocker locker(fLock);
613 
614 	status_t status;
615 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
616 	if (gStackModule->is_restarted_syscall())
617 		timeout = gStackModule->restore_syscall_restart_timeout();
618 	else
619 		gStackModule->store_syscall_restart_timeout(timeout);
620 
621 	do {
622 		locker.Unlock();
623 
624 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
625 			| B_CAN_INTERRUPT, timeout);
626 		if (status < B_OK)
627 			return status;
628 
629 		locker.Lock();
630 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
631 #ifdef TRACE_TCP
632 		if (status == B_OK)
633 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
634 #endif
635 	} while (status < B_OK);
636 
637 	return status;
638 }
639 
640 
641 status_t
642 TCPEndpoint::Bind(const sockaddr *address)
643 {
644 	if (address == NULL)
645 		return B_BAD_VALUE;
646 
647 	MutexLocker lock(fLock);
648 
649 	TRACE("Bind() on address %s", PrintAddress(address));
650 
651 	if (fState != CLOSED)
652 		return EISCONN;
653 
654 	return fManager->Bind(this, address);
655 }
656 
657 
658 status_t
659 TCPEndpoint::Unbind(struct sockaddr *address)
660 {
661 	TRACE("Unbind()");
662 
663 	MutexLocker lock(fLock);
664 	return fManager->Unbind(this);
665 }
666 
667 
668 status_t
669 TCPEndpoint::Listen(int count)
670 {
671 	TRACE("Listen()");
672 
673 	MutexLocker lock(fLock);
674 
675 	if (fState != CLOSED)
676 		return B_BAD_VALUE;
677 
678 	fAcceptSemaphore = create_sem(0, "tcp accept");
679 	if (fAcceptSemaphore < B_OK)
680 		return ENOBUFS;
681 
682 	status_t status = fManager->SetPassive(this);
683 	if (status < B_OK) {
684 		delete_sem(fAcceptSemaphore);
685 		fAcceptSemaphore = -1;
686 		return status;
687 	}
688 
689 	fState = LISTEN;
690 	T(State(this));
691 	return B_OK;
692 }
693 
694 
695 status_t
696 TCPEndpoint::Shutdown(int direction)
697 {
698 	TRACE("Shutdown(%i)", direction);
699 
700 	MutexLocker lock(fLock);
701 
702 	if (direction == SHUT_RD || direction == SHUT_RDWR)
703 		fFlags |= FLAG_NO_RECEIVE;
704 
705 	if (direction == SHUT_WR || direction == SHUT_RDWR)
706 		_Disconnect(false);
707 
708 	return B_OK;
709 }
710 
711 
712 /*!	Puts data contained in \a buffer into send buffer */
713 status_t
714 TCPEndpoint::SendData(net_buffer *buffer)
715 {
716 	MutexLocker lock(fLock);
717 
718 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
719 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
720 		  fSendQueue.Free());
721 
722 	if (fState == CLOSED)
723 		return ENOTCONN;
724 	if (fState == LISTEN)
725 		return EDESTADDRREQ;
726 	if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED
727 		|| fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
728 		|| fState == TIME_WAIT) {
729 		// we only send signals when called from userland
730 		if (gStackModule->is_syscall)
731 			send_signal(find_thread(NULL), SIGPIPE);
732 		return EPIPE;
733 	}
734 
735 	uint32 flags = buffer->flags;
736 	size_t left = buffer->size;
737 
738 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
739 	if (gStackModule->is_restarted_syscall())
740 		timeout = gStackModule->restore_syscall_restart_timeout();
741 	else
742 		gStackModule->store_syscall_restart_timeout(timeout);
743 
744 	while (left > 0) {
745 		while (fSendQueue.Free() < socket->send.low_water_mark) {
746 			// wait until enough space is available
747 			status_t status = fSendList.Wait(lock, timeout);
748 			if (status < B_OK) {
749 				TRACE("  SendData() returning %s (%d)",
750 					strerror(posix_error(status)), (int)posix_error(status));
751 				return posix_error(status);
752 			}
753 		}
754 
755 		size_t size = fSendQueue.Free();
756 		if (size < left) {
757 			// we need to split the original buffer
758 			net_buffer* clone = gBufferModule->clone(buffer, false);
759 				// TODO: add offset/size parameter to net_buffer::clone() or
760 				// even a move_data() function, as this is a bit inefficient
761 			if (clone == NULL)
762 				return ENOBUFS;
763 
764 			status_t status = gBufferModule->trim(clone, size);
765 			if (status != B_OK) {
766 				gBufferModule->free(clone);
767 				return status;
768 			}
769 
770 			gBufferModule->remove_header(buffer, size);
771 			left -= size;
772 			fSendQueue.Add(clone);
773 		} else {
774 			left -= buffer->size;
775 			fSendQueue.Add(buffer);
776 		}
777 	}
778 
779 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
780 
781 	bool force = false;
782 	if ((flags & MSG_OOB) != 0) {
783 		fSendUrgentOffset = fSendQueue.LastSequence();
784 			// RFC 961 specifies that the urgent offset points to the last
785 			// byte of urgent data. However, this is commonly implemented as
786 			// here, ie. it points to the first byte after the urgent data.
787 		force = true;
788 	}
789 	if ((flags & MSG_EOF) != 0)
790 		_Disconnect(false);
791 
792 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
793 		_SendQueued(force);
794 
795 	return B_OK;
796 }
797 
798 
799 ssize_t
800 TCPEndpoint::SendAvailable()
801 {
802 	MutexLocker locker(fLock);
803 
804 	ssize_t available;
805 
806 	if (is_writable(fState))
807 		available = fSendQueue.Free();
808 	else
809 		available = EPIPE;
810 
811 	TRACE("SendAvailable(): %li", available);
812 	return available;
813 }
814 
815 
816 status_t
817 TCPEndpoint::FillStat(net_stat *stat)
818 {
819 	MutexLocker _(fLock);
820 
821 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
822 	stat->receive_queue_size = fReceiveQueue.Available();
823 	stat->send_queue_size = fSendQueue.Used();
824 
825 	return B_OK;
826 }
827 
828 
829 status_t
830 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
831 {
832 	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);
833 
834 	MutexLocker locker(fLock);
835 
836 	*_buffer = NULL;
837 
838 	if (fState == CLOSED)
839 		return ENOTCONN;
840 
841 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
842 	if (gStackModule->is_restarted_syscall())
843 		timeout = gStackModule->restore_syscall_restart_timeout();
844 	else
845 		gStackModule->store_syscall_restart_timeout(timeout);
846 
847 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
848 		if (flags & MSG_DONTWAIT)
849 			return B_WOULD_BLOCK;
850 
851 		status_t status = _WaitForEstablished(locker, timeout);
852 		if (status < B_OK)
853 			return posix_error(status);
854 	}
855 
856 	size_t dataNeeded = socket->receive.low_water_mark;
857 
858 	// When MSG_WAITALL is set then the function should block
859 	// until the full amount of data can be returned.
860 	if (flags & MSG_WAITALL)
861 		dataNeeded = numBytes;
862 
863 	// TODO: add support for urgent data (MSG_OOB)
864 
865 	while (true) {
866 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
867 			|| fState == TIME_WAIT) {
868 			// ``Connection closing''.
869 			return B_OK;
870 		}
871 
872 		if (fReceiveQueue.Available() > 0) {
873 			if (fReceiveQueue.Available() >= dataNeeded
874 				|| (fReceiveQueue.PushedData() > 0
875 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
876 				break;
877 		} else if (fState == FINISH_RECEIVED) {
878 			// ``If no text is awaiting delivery, the RECEIVE will
879 			//   get a Connection closing''.
880 			return B_OK;
881 		}
882 
883 		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
884 			return B_WOULD_BLOCK;
885 
886 		status_t status = fReceiveList.Wait(locker, timeout);
887 		if (status < B_OK) {
888 			// The Open Group base specification mentions that EINTR should be
889 			// returned if the recv() is interrupted before _any data_ is
890 			// available. So we actually check if there is data, and if so,
891 			// push it to the user.
892 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
893 				&& fReceiveQueue.Available() > 0)
894 				break;
895 
896 			return posix_error(status);
897 		}
898 	}
899 
900 	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());
901 
902 	if (numBytes < fReceiveQueue.Available())
903 		fReceiveList.Signal();
904 
905 	bool clone = (flags & MSG_PEEK) != 0;
906 
907 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
908 
909 	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());
910 
911 	// if we are opening the window, check if we should send an ACK
912 	if (!clone)
913 		SendAcknowledge(false);
914 
915 	return receivedBytes;
916 }
917 
918 
919 ssize_t
920 TCPEndpoint::ReadAvailable()
921 {
922 	MutexLocker locker(fLock);
923 
924 	TRACE("ReadAvailable(): %li", _AvailableData());
925 
926 	return _AvailableData();
927 }
928 
929 
930 status_t
931 TCPEndpoint::SetSendBufferSize(size_t length)
932 {
933 	MutexLocker _(fLock);
934 	fSendQueue.SetMaxBytes(length);
935 	return B_OK;
936 }
937 
938 
939 status_t
940 TCPEndpoint::SetReceiveBufferSize(size_t length)
941 {
942 	MutexLocker _(fLock);
943 	fReceiveQueue.SetMaxBytes(length);
944 	return B_OK;
945 }
946 
947 
948 status_t
949 TCPEndpoint::GetOption(int option, void* _value, int* _length)
950 {
951 	if (*_length != sizeof(int))
952 		return B_BAD_VALUE;
953 
954 	int* value = (int*)_value;
955 
956 	switch (option) {
957 		case TCP_NODELAY:
958 			if ((fOptions & TCP_NODELAY) != 0)
959 				*value = 1;
960 			else
961 				*value = 0;
962 			return B_OK;
963 
964 		case TCP_MAXSEG:
965 			*value = fReceiveMaxSegmentSize;
966 			return B_OK;
967 
968 		default:
969 			return B_BAD_VALUE;
970 	}
971 }
972 
973 
974 status_t
975 TCPEndpoint::SetOption(int option, const void* _value, int length)
976 {
977 	if (option != TCP_NODELAY)
978 		return B_BAD_VALUE;
979 
980 	if (length != sizeof(int))
981 		return B_BAD_VALUE;
982 
983 	const int* value = (const int*)_value;
984 
985 	MutexLocker _(fLock);
986 	if (*value)
987 		fOptions |= TCP_NODELAY;
988 	else
989 		fOptions &= ~TCP_NODELAY;
990 
991 	return B_OK;
992 }
993 
994 
995 //	#pragma mark - misc
996 
997 
998 bool
999 TCPEndpoint::IsBound() const
1000 {
1001 	return !LocalAddress().IsEmpty(true);
1002 }
1003 
1004 
1005 status_t
1006 TCPEndpoint::DelayedAcknowledge()
1007 {
1008 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1009 		// timer was active, send an ACK now (with the exception above,
1010 		// we send every other ACK)
1011 		return SendAcknowledge(true);
1012 	}
1013 
1014 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1015 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1016 	return B_OK;
1017 }
1018 
1019 
1020 status_t
1021 TCPEndpoint::SendAcknowledge(bool force)
1022 {
1023 	return _SendQueued(force, 0);
1024 }
1025 
1026 
1027 void
1028 TCPEndpoint::_StartPersistTimer()
1029 {
1030 	gStackModule->set_timer(&fPersistTimer, 1000000LL);
1031 }
1032 
1033 
1034 void
1035 TCPEndpoint::_EnterTimeWait()
1036 {
1037 	TRACE("_EnterTimeWait()\n");
1038 
1039 	_CancelConnectionTimers();
1040 
1041 	if (fState == TIME_WAIT && fRoute != NULL
1042 		&& (fRoute->flags & RTF_LOCAL) != 0) {
1043 		// we do not use TIME_WAIT state for local connections
1044 		fFlags |= FLAG_DELETE_ON_CLOSE;
1045 		return;
1046 	}
1047 
1048 	_UpdateTimeWait();
1049 }
1050 
1051 
1052 void
1053 TCPEndpoint::_UpdateTimeWait()
1054 {
1055 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1056 }
1057 
1058 
1059 void
1060 TCPEndpoint::_CancelConnectionTimers()
1061 {
1062 	gStackModule->cancel_timer(&fRetransmitTimer);
1063 	gStackModule->cancel_timer(&fPersistTimer);
1064 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1065 }
1066 
1067 
1068 /*!	Sends the FIN flag to the peer when the connection is still open.
1069 	Moves the endpoint to the next state depending on where it was.
1070 */
1071 status_t
1072 TCPEndpoint::_Disconnect(bool closing)
1073 {
1074 	tcp_state previousState = fState;
1075 
1076 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1077 		fState = FINISH_SENT;
1078 	else if (fState == FINISH_RECEIVED)
1079 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1080 	else
1081 		return B_OK;
1082 
1083 	T(State(this));
1084 
1085 	status_t status = _SendQueued();
1086 	if (status != B_OK) {
1087 		fState = previousState;
1088 		T(State(this));
1089 		return status;
1090 	}
1091 
1092 	return B_OK;
1093 }
1094 
1095 
1096 void
1097 TCPEndpoint::_MarkEstablished()
1098 {
1099 	fState = ESTABLISHED;
1100 	T(State(this));
1101 
1102 	if (socket->parent != NULL) {
1103 		gSocketModule->set_connected(socket);
1104 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1105 	}
1106 
1107 	fSendList.Signal();
1108 }
1109 
1110 
1111 status_t
1112 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1113 {
1114 	while (fState != ESTABLISHED) {
1115 		if (socket->error != B_OK)
1116 			return socket->error;
1117 
1118 		status_t status = fSendList.Wait(locker, timeout);
1119 		if (status < B_OK)
1120 			return status;
1121 	}
1122 
1123 	return B_OK;
1124 }
1125 
1126 
1127 //	#pragma mark - receive
1128 
1129 
1130 void
1131 TCPEndpoint::_Close()
1132 {
1133 	_CancelConnectionTimers();
1134 	fState = CLOSED;
1135 	T(State(this));
1136 
1137 	fFlags |= FLAG_DELETE_ON_CLOSE;
1138 }
1139 
1140 
1141 void
1142 TCPEndpoint::_HandleReset(status_t error)
1143 {
1144 	socket->error = error;
1145 	_Close();
1146 
1147 	fSendList.Signal();
1148 	_NotifyReader();
1149 
1150 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1151 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1152 }
1153 
1154 
1155 void
1156 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1157 {
1158 	if (++fDuplicateAcknowledgeCount < 3)
1159 		return;
1160 
1161 	if (fDuplicateAcknowledgeCount == 3) {
1162 		_ResetSlowStart();
1163 		fCongestionWindow = fSlowStartThreshold + 3
1164 			* fSendMaxSegmentSize;
1165 		fSendNext = segment.acknowledge;
1166 	} else if (fDuplicateAcknowledgeCount > 3)
1167 		fCongestionWindow += fSendMaxSegmentSize;
1168 
1169 	_SendQueued();
1170 }
1171 
1172 
1173 void
1174 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1175 	size_t segmentLength)
1176 {
1177 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1178 		tcp_sequence sequence(segment.sequence);
1179 
1180 		if (fLastAcknowledgeSent >= sequence
1181 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1182 			fReceivedTimestamp = segment.timestamp_value;
1183 	}
1184 }
1185 
1186 
1187 ssize_t
1188 TCPEndpoint::_AvailableData() const
1189 {
1190 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1191 	//       the application of FLAG_NO_RECEIVE in listen()ing
1192 	//       sockets.
1193 	if (fState == LISTEN)
1194 		return gSocketModule->count_connected(socket);
1195 	if (fState == SYNCHRONIZE_SENT)
1196 		return 0;
1197 
1198 	ssize_t availableData = fReceiveQueue.Available();
1199 
1200 	if (availableData == 0 && !_ShouldReceive())
1201 		return ENOTCONN;
1202 
1203 	return availableData;
1204 }
1205 
1206 
1207 void
1208 TCPEndpoint::_NotifyReader()
1209 {
1210 	fReceiveList.Signal();
1211 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1212 }
1213 
1214 
1215 bool
1216 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1217 {
1218 	fReceiveQueue.Add(buffer, segment.sequence);
1219 	fReceiveNext = fReceiveQueue.NextSequence();
1220 
1221 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1222 		(uint32)fReceiveNext, fReceiveQueue.Available());
1223 
1224 	if (segment.flags & TCP_FLAG_PUSH)
1225 		fReceiveQueue.SetPushPointer();
1226 
1227 	return fReceiveQueue.Available() > 0;
1228 }
1229 
1230 
1231 void
1232 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1233 {
1234 	fInitialReceiveSequence = segment.sequence;
1235 
1236 	// count the received SYN
1237 	segment.sequence++;
1238 
1239 	fReceiveNext = segment.sequence;
1240 	fReceiveQueue.SetInitialSequence(segment.sequence);
1241 
1242 	if ((fOptions & TCP_NOOPT) == 0) {
1243 		if (segment.max_segment_size > 0)
1244 			fSendMaxSegmentSize = segment.max_segment_size;
1245 
1246 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1247 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1248 			fSendWindowShift = segment.window_shift;
1249 		} else {
1250 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1251 			fReceiveWindowShift = 0;
1252 		}
1253 
1254 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1255 			fFlags |= FLAG_OPTION_TIMESTAMP;
1256 			fReceivedTimestamp = segment.timestamp_value;
1257 		} else
1258 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1259 	}
1260 
1261 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1262 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1263 }
1264 
1265 
1266 bool
1267 TCPEndpoint::_ShouldReceive() const
1268 {
1269 	if (fFlags & FLAG_NO_RECEIVE)
1270 		return false;
1271 
1272 	return fState == ESTABLISHED || fState == FINISH_SENT
1273 		|| fState == FINISH_ACKNOWLEDGED;
1274 }
1275 
1276 
1277 int32
1278 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1279 	net_buffer* buffer)
1280 {
1281 	MutexLocker _(fLock);
1282 
1283 	// TODO error checking
1284 	ProtocolSocket::Open();
1285 
1286 	fState = SYNCHRONIZE_RECEIVED;
1287 	T(State(this));
1288 	fManager = parent->fManager;
1289 
1290 	LocalAddress().SetTo(buffer->destination);
1291 	PeerAddress().SetTo(buffer->source);
1292 
1293 	TRACE("Spawn()");
1294 
1295 	// TODO: proper error handling!
1296 	if (fManager->BindChild(this) < B_OK)
1297 		return DROP;
1298 
1299 	if (_PrepareSendPath(*PeerAddress()) < B_OK)
1300 		return DROP;
1301 
1302 	fOptions = parent->fOptions;
1303 	fAcceptSemaphore = parent->fAcceptSemaphore;
1304 
1305 	_PrepareReceivePath(segment);
1306 
1307 	// send SYN+ACK
1308 	if (_SendQueued() < B_OK)
1309 		return DROP;
1310 
1311 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1312 		// we handled this flag now, it must not be set for further processing
1313 
1314 	return _Receive(segment, buffer);
1315 }
1316 
1317 
1318 int32
1319 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1320 {
1321 	TRACE("ListenReceive()");
1322 
1323 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1324 	// but the error behaviour differs
1325 	if (segment.flags & TCP_FLAG_RESET)
1326 		return DROP;
1327 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1328 		return DROP | RESET;
1329 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1330 		return DROP;
1331 
1332 	// TODO: drop broadcast/multicast
1333 
1334 	// spawn new endpoint for accept()
1335 	net_socket *newSocket;
1336 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK)
1337 		return DROP;
1338 
1339 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1340 		segment, buffer);
1341 }
1342 
1343 
1344 int32
1345 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1346 	net_buffer *buffer)
1347 {
1348 	TRACE("_SynchronizeSentReceive()");
1349 
1350 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1351 		&& (fInitialSendSequence >= segment.acknowledge
1352 			|| fSendMax < segment.acknowledge))
1353 		return DROP | RESET;
1354 
1355 	if (segment.flags & TCP_FLAG_RESET) {
1356 		_HandleReset(ECONNREFUSED);
1357 		return DROP;
1358 	}
1359 
1360 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1361 		return DROP;
1362 
1363 	fSendUnacknowledged = segment.acknowledge;
1364 	_PrepareReceivePath(segment);
1365 
1366 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1367 		_MarkEstablished();
1368 	} else {
1369 		// simultaneous open
1370 		fState = SYNCHRONIZE_RECEIVED;
1371 		T(State(this));
1372 	}
1373 
1374 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1375 		// we handled this flag now, it must not be set for further processing
1376 
1377 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1378 }
1379 
1380 
1381 int32
1382 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1383 {
1384 	uint32 advertisedWindow = (uint32)segment.advertised_window
1385 		<< fSendWindowShift;
1386 	size_t segmentLength = buffer->size;
1387 
1388 	// First, handle the most common case for uni-directional data transfer
1389 	// (known as header prediction - the segment must not change the window,
1390 	// and must be the expected sequence, and contain no control flags)
1391 
1392 	if (fState == ESTABLISHED
1393 		&& segment.AcknowledgeOnly()
1394 		&& fReceiveNext == segment.sequence
1395 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1396 		&& fSendNext == fSendMax) {
1397 		_UpdateTimestamps(segment, segmentLength);
1398 
1399 		if (segmentLength == 0) {
1400 			// this is a pure acknowledge segment - we're on the sending end
1401 			if (fSendUnacknowledged < segment.acknowledge
1402 				&& fSendMax >= segment.acknowledge) {
1403 				_Acknowledged(segment);
1404 				return DROP;
1405 			}
1406 		} else if (segment.acknowledge == fSendUnacknowledged
1407 			&& fReceiveQueue.IsContiguous()
1408 			&& fReceiveQueue.Free() >= segmentLength
1409 			&& !(fFlags & FLAG_NO_RECEIVE)) {
1410 			if (_AddData(segment, buffer))
1411 				_NotifyReader();
1412 
1413 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1414 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1415 		}
1416 	}
1417 
1418 	// The fast path was not applicable, so we continue with the standard
1419 	// processing of the incoming segment
1420 
1421 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1422 
1423 	if (fState != CLOSED && fState != TIME_WAIT) {
1424 		// Check sequence number
1425 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1426 				fReceiveWindow)) {
1427 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1428 				(uint32)fReceiveNext, fReceiveWindow);
1429 			if (segment.flags & TCP_FLAG_RESET) {
1430 				// TODO: this doesn't look right - review!
1431 				return DROP;
1432 			}
1433 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1434 		}
1435 	}
1436 
1437 	if (segment.flags & TCP_FLAG_RESET) {
1438 		// Is this a valid reset?
1439 		// We generally ignore resets in time wait state (see RFC 1337)
1440 		if (fLastAcknowledgeSent <= segment.sequence
1441 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1442 				+ fReceiveWindow)
1443 			&& fState != TIME_WAIT) {
1444 			status_t error;
1445 			if (fState == SYNCHRONIZE_RECEIVED)
1446 				error = ECONNREFUSED;
1447 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1448 				error = ENOTCONN;
1449 			else
1450 				error = ECONNRESET;
1451 
1452 			_HandleReset(error);
1453 		}
1454 
1455 		return DROP;
1456 	}
1457 
1458 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1459 		|| (fState == SYNCHRONIZE_RECEIVED
1460 			&& (fInitialReceiveSequence > segment.sequence
1461 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1462 					&& (fSendUnacknowledged > segment.acknowledge
1463 						|| fSendMax < segment.acknowledge)))) {
1464 		// reset the connection - either the initial SYN was faulty, or we
1465 		// received a SYN within the data stream
1466 		return DROP | RESET;
1467 	}
1468 
1469 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1470 		// the window must not shrink
1471 
1472 	// trim buffer to be within the receive window
1473 	int32 drop = fReceiveNext - segment.sequence;
1474 	if (drop > 0) {
1475 		if ((uint32)drop > buffer->size
1476 			|| ((uint32)drop == buffer->size
1477 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1478 			// don't accidently remove a FIN we shouldn't remove
1479 			segment.flags &= ~TCP_FLAG_FINISH;
1480 			drop = buffer->size;
1481 		}
1482 
1483 		// remove duplicate data at the start
1484 		TRACE("* remove %ld bytes from the start", drop);
1485 		gBufferModule->remove_header(buffer, drop);
1486 		segment.sequence += drop;
1487 	}
1488 
1489 	int32 action = KEEP;
1490 
1491 	drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow);
1492 	if (drop > 0) {
1493 		// remove data exceeding our window
1494 		if ((uint32)drop >= buffer->size) {
1495 			// if we can accept data, or the segment is not what we'd expect,
1496 			// drop the segment (an immediate acknowledge is always triggered)
1497 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1498 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1499 
1500 			action |= IMMEDIATE_ACKNOWLEDGE;
1501 		}
1502 
1503 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1504 			// we need to remove the finish, too, as part of the data
1505 			drop--;
1506 		}
1507 
1508 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1509 		TRACE("* remove %ld bytes from the end", drop);
1510 		gBufferModule->remove_trailer(buffer, drop);
1511 	}
1512 
1513 #ifdef TRACE_TCP
1514 	if (advertisedWindow > fSendWindow) {
1515 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1516 			advertisedWindow);
1517 	}
1518 #endif
1519 
1520 	fSendWindow = advertisedWindow;
1521 	if (advertisedWindow > fSendMaxWindow)
1522 		fSendMaxWindow = advertisedWindow;
1523 
1524 	// Then look at the acknowledgement for any updates
1525 
1526 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1527 		// process acknowledged data
1528 		if (fState == SYNCHRONIZE_RECEIVED)
1529 			_MarkEstablished();
1530 
1531 		if (fSendMax < segment.acknowledge)
1532 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1533 
1534 		if (segment.acknowledge < fSendUnacknowledged) {
1535 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1536 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1537 				TRACE("Receive(): duplicate ack!");
1538 
1539 				_DuplicateAcknowledge(segment);
1540 			}
1541 
1542 			return DROP;
1543 		} else {
1544 			// this segment acknowledges in flight data
1545 
1546 			if (fDuplicateAcknowledgeCount >= 3) {
1547 				// deflate the window.
1548 				fCongestionWindow = fSlowStartThreshold;
1549 			}
1550 
1551 			fDuplicateAcknowledgeCount = 0;
1552 
1553 			if (fSendMax == segment.acknowledge)
1554 				TRACE("Receive(): all inflight data ack'd!");
1555 
1556 			if (segment.acknowledge > fSendQueue.LastSequence()
1557 					&& fState > ESTABLISHED) {
1558 				TRACE("Receive(): FIN has been acknowledged!");
1559 
1560 				switch (fState) {
1561 					case FINISH_SENT:
1562 						fState = FINISH_ACKNOWLEDGED;
1563 						T(State(this));
1564 						break;
1565 					case CLOSING:
1566 						fState = TIME_WAIT;
1567 						T(State(this));
1568 						_EnterTimeWait();
1569 						return DROP;
1570 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1571 						_Close();
1572 						break;
1573 
1574 					default:
1575 						break;
1576 				}
1577 			}
1578 
1579 			if (fState != CLOSED)
1580 				_Acknowledged(segment);
1581 		}
1582 	}
1583 
1584 	if (segment.flags & TCP_FLAG_URGENT) {
1585 		if (fState == ESTABLISHED || fState == FINISH_SENT
1586 			|| fState == FINISH_ACKNOWLEDGED) {
1587 			// TODO: Handle urgent data:
1588 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1589 			//  - signal the user that urgent data is available (SIGURG)
1590 		}
1591 	}
1592 
1593 	bool notify = false;
1594 
1595 	if (buffer->size > 0 &&	_ShouldReceive())
1596 		notify = _AddData(segment, buffer);
1597 	else
1598 		action = (action & ~KEEP) | DROP;
1599 
1600 	if (segment.flags & TCP_FLAG_FINISH) {
1601 		segmentLength++;
1602 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1603 			TRACE("Receive(): peer is finishing connection!");
1604 			fReceiveNext++;
1605 			notify = true;
1606 
1607 			// FIN implies PSH
1608 			fReceiveQueue.SetPushPointer();
1609 
1610 			// we'll reply immediately to the FIN if we are not
1611 			// transitioning to TIME WAIT so we immediatly ACK it.
1612 			action |= IMMEDIATE_ACKNOWLEDGE;
1613 
1614 			// other side is closing connection; change states
1615 			switch (fState) {
1616 				case ESTABLISHED:
1617 				case SYNCHRONIZE_RECEIVED:
1618 					fState = FINISH_RECEIVED;
1619 					T(State(this));
1620 					break;
1621 				case FINISH_SENT:
1622 					// simultaneous close
1623 					fState = CLOSING;
1624 					T(State(this));
1625 					break;
1626 				case FINISH_ACKNOWLEDGED:
1627 					fState = TIME_WAIT;
1628 					T(State(this));
1629 					_EnterTimeWait();
1630 					break;
1631 				case TIME_WAIT:
1632 					_UpdateTimeWait();
1633 					break;
1634 
1635 				default:
1636 					break;
1637 			}
1638 		}
1639 	}
1640 
1641 	if (notify)
1642 		_NotifyReader();
1643 
1644 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1645 		action |= ACKNOWLEDGE;
1646 
1647 	_UpdateTimestamps(segment, segmentLength);
1648 
1649 	TRACE("Receive() Action %ld", action);
1650 
1651 	return action;
1652 }
1653 
1654 
1655 int32
1656 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1657 {
1658 	MutexLocker locker(fLock);
1659 
1660 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1661 		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1662 		buffer, buffer->size, PrintAddress(buffer->source),
1663 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1664 		segment.acknowledge,
1665 		(uint32)segment.advertised_window << fSendWindowShift);
1666 	T(Receive(this, segment,
1667 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1668 	int32 segmentAction = DROP;
1669 
1670 	switch (fState) {
1671 		case LISTEN:
1672 			segmentAction = _ListenReceive(segment, buffer);
1673 			break;
1674 
1675 		case SYNCHRONIZE_SENT:
1676 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1677 			break;
1678 
1679 		case SYNCHRONIZE_RECEIVED:
1680 		case ESTABLISHED:
1681 		case FINISH_RECEIVED:
1682 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1683 		case FINISH_SENT:
1684 		case FINISH_ACKNOWLEDGED:
1685 		case CLOSING:
1686 		case TIME_WAIT:
1687 		case CLOSED:
1688 			segmentAction = _Receive(segment, buffer);
1689 			break;
1690 	}
1691 
1692 	// process acknowledge action as asked for by the *Receive() method
1693 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1694 		SendAcknowledge(true);
1695 	else if (segmentAction & ACKNOWLEDGE)
1696 		DelayedAcknowledge();
1697 
1698 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1699 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1700 		locker.Unlock();
1701 		gSocketModule->delete_socket(socket);
1702 			// this will also delete us
1703 	}
1704 
1705 	return segmentAction;
1706 }
1707 
1708 
1709 //	#pragma mark - send
1710 
1711 
1712 inline uint8
1713 TCPEndpoint::_CurrentFlags()
1714 {
1715 	// we don't set FLAG_FINISH here, instead we do it
1716 	// conditionally below depending if we are sending
1717 	// the last bytes of the send queue.
1718 
1719 	switch (fState) {
1720 		case CLOSED:
1721 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1722 
1723 		case SYNCHRONIZE_SENT:
1724 			return TCP_FLAG_SYNCHRONIZE;
1725 		case SYNCHRONIZE_RECEIVED:
1726 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1727 
1728 		case ESTABLISHED:
1729 		case FINISH_RECEIVED:
1730 		case FINISH_ACKNOWLEDGED:
1731 		case TIME_WAIT:
1732 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1733 		case FINISH_SENT:
1734 		case CLOSING:
1735 			return TCP_FLAG_ACKNOWLEDGE;
1736 
1737 		default:
1738 			return 0;
1739 	}
1740 }
1741 
1742 
1743 inline bool
1744 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1745 	uint32 segmentMaxSize, uint32 flightSize)
1746 {
1747 	if (length > 0) {
1748 		// Avoid the silly window syndrome - we only send a segment in case:
1749 		// - we have a full segment to send, or
1750 		// - we're at the end of our buffer queue, or
1751 		// - the buffer is at least larger than half of the maximum send window,
1752 		//   or
1753 		// - we're retransmitting data
1754 		if (length == segmentMaxSize
1755 			|| (fOptions & TCP_NODELAY) != 0
1756 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1757 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1758 			return true;
1759 	}
1760 
1761 	// check if we need to send a window update to the peer
1762 	if (segment.advertised_window > 0) {
1763 		// correct the window to take into account what already has been advertised
1764 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1765 			- (fReceiveMaxAdvertised - fReceiveNext);
1766 
1767 		// if we can advertise a window larger than twice the maximum segment
1768 		// size, or half the maximum buffer size we send a window update
1769 		if (window >= (fReceiveMaxSegmentSize << 1)
1770 			|| window >= (socket->receive.buffer_size >> 1))
1771 			return true;
1772 	}
1773 
1774 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1775 			| TCP_FLAG_RESET)) != 0)
1776 		return true;
1777 
1778 	// We do have urgent data pending
1779 	if (fSendUrgentOffset > fSendNext)
1780 		return true;
1781 
1782 	// there is no reason to send a segment just now
1783 	return false;
1784 }
1785 
1786 
1787 status_t
1788 TCPEndpoint::_SendQueued(bool force)
1789 {
1790 	return _SendQueued(force, fSendWindow);
1791 }
1792 
1793 
1794 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1795 	specific flags that need to be sent.
1796 */
1797 status_t
1798 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1799 {
1800 	if (fRoute == NULL)
1801 		return B_ERROR;
1802 
1803 	// in passive state?
1804 	if (fState == LISTEN)
1805 		return B_ERROR;
1806 
1807 	tcp_segment_header segment(_CurrentFlags());
1808 
1809 	if ((fOptions & TCP_NOOPT) == 0) {
1810 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1811 			segment.options |= TCP_HAS_TIMESTAMPS;
1812 			segment.timestamp_reply = fReceivedTimestamp;
1813 			segment.timestamp_value = tcp_now();
1814 		}
1815 
1816 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1817 			&& fSendNext == fInitialSendSequence) {
1818 			// add connection establishment options
1819 			segment.max_segment_size = fReceiveMaxSegmentSize;
1820 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1821 				segment.options |= TCP_HAS_WINDOW_SCALE;
1822 				segment.window_shift = fReceiveWindowShift;
1823 			}
1824 		}
1825 	}
1826 
1827 	size_t availableBytes = fReceiveQueue.Free();
1828 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1829 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1830 	else
1831 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1832 
1833 	segment.acknowledge = fReceiveNext;
1834 
1835 	// Process urgent data
1836 	if (fSendUrgentOffset > fSendNext) {
1837 		segment.flags |= TCP_FLAG_URGENT;
1838 		segment.urgent_offset = fSendUrgentOffset - fSendNext;
1839 	} else {
1840 		fSendUrgentOffset = fSendUnacknowledged;
1841 			// Keep urgent offset updated, so that it doesn't reach into our
1842 			// send window on overlap
1843 		segment.urgent_offset = 0;
1844 	}
1845 
1846 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1847 		sendWindow = fCongestionWindow;
1848 
1849 	// fSendUnacknowledged
1850 	//  |    fSendNext      fSendMax
1851 	//  |        |              |
1852 	//  v        v              v
1853 	//  -----------------------------------
1854 	//  | effective window           |
1855 	//  -----------------------------------
1856 
1857 	// Flight size represents the window of data which is currently in the
1858 	// ether. We should never send data such as the flight size becomes larger
1859 	// than the effective window. Note however that the effective window may be
1860 	// reduced (by congestion for instance), so at some point in time flight
1861 	// size may be larger than the currently calculated window.
1862 
1863 	uint32 flightSize = fSendMax - fSendUnacknowledged;
1864 	uint32 consumedWindow = fSendNext - fSendUnacknowledged;
1865 
1866 	if (consumedWindow > sendWindow) {
1867 		sendWindow = 0;
1868 		// TODO enter persist state? try to get a window update.
1869 	} else
1870 		sendWindow -= consumedWindow;
1871 
1872 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1873 		// send one byte of data to ask for a window update
1874 		// (triggered by the persist timer)
1875 		sendWindow = 1;
1876 	}
1877 
1878 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1879 	tcp_sequence previousSendNext = fSendNext;
1880 
1881 	do {
1882 		uint32 segmentMaxSize = fSendMaxSegmentSize
1883 			- tcp_options_length(segment);
1884 		uint32 segmentLength = min_c(length, segmentMaxSize);
1885 
1886 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1887 			if (state_needs_finish(fState))
1888 				segment.flags |= TCP_FLAG_FINISH;
1889 			if (length > 0)
1890 				segment.flags |= TCP_FLAG_PUSH;
1891 		}
1892 
1893 		// Determine if we should really send this segment
1894 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1895 			segmentMaxSize, flightSize)) {
1896 			if (fSendQueue.Available()
1897 				&& !gStackModule->is_timer_active(&fPersistTimer)
1898 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1899 				_StartPersistTimer();
1900 			break;
1901 		}
1902 
1903 		net_buffer *buffer = gBufferModule->create(256);
1904 		if (buffer == NULL)
1905 			return B_NO_MEMORY;
1906 
1907 		status_t status = B_OK;
1908 		if (segmentLength > 0)
1909 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
1910 		if (status < B_OK) {
1911 			gBufferModule->free(buffer);
1912 			return status;
1913 		}
1914 
1915 		LocalAddress().CopyTo(buffer->source);
1916 		PeerAddress().CopyTo(buffer->destination);
1917 
1918 		uint32 size = buffer->size;
1919 		segment.sequence = fSendNext;
1920 
1921 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
1922 			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
1923 			"\tlen %lu first %lu last %lu",
1924 			buffer, buffer->size, PrintAddress(buffer->source),
1925 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
1926 			segment.acknowledge, segment.advertised_window,
1927 			fCongestionWindow, fSlowStartThreshold, segmentLength,
1928 			(uint32)fSendQueue.FirstSequence(),
1929 			(uint32)fSendQueue.LastSequence());
1930 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
1931 			fSendQueue.LastSequence()));
1932 
1933 		PROBE(buffer, sendWindow);
1934 		sendWindow -= buffer->size;
1935 
1936 		status = add_tcp_header(AddressModule(), segment, buffer);
1937 		if (status != B_OK) {
1938 			gBufferModule->free(buffer);
1939 			return status;
1940 		}
1941 
1942 		// Update send status - we need to do this before we send the data
1943 		// for local connections as the answer is directly handled
1944 
1945 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
1946 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
1947 			segment.max_segment_size = 0;
1948 			size++;
1949 		}
1950 
1951 		if (segment.flags & TCP_FLAG_FINISH)
1952 			size++;
1953 
1954 		uint32 sendMax = fSendMax;
1955 		fSendNext += size;
1956 		if (fSendMax < fSendNext)
1957 			fSendMax = fSendNext;
1958 
1959 		fReceiveMaxAdvertised = fReceiveNext
1960 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
1961 
1962 		status = next->module->send_routed_data(next, fRoute, buffer);
1963 		if (status < B_OK) {
1964 			gBufferModule->free(buffer);
1965 
1966 			fSendNext = segment.sequence;
1967 			fSendMax = sendMax;
1968 				// restore send status
1969 			return status;
1970 		}
1971 
1972 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1973 			fLastAcknowledgeSent = segment.acknowledge;
1974 
1975 		length -= segmentLength;
1976 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
1977 			| TCP_FLAG_FINISH);
1978 	} while (length > 0);
1979 
1980 	// if we sent data from the beggining of the send queue,
1981 	// start the retransmition timer
1982 	if (previousSendNext == fSendUnacknowledged
1983 		&& fSendNext > previousSendNext) {
1984 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
1985 			fRetransmitTimeout);
1986 
1987 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
1988 	}
1989 
1990 	return B_OK;
1991 }
1992 
1993 
1994 int
1995 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
1996 {
1997 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
1998 }
1999 
2000 
2001 status_t
2002 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2003 {
2004 	if (fRoute == NULL) {
2005 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2006 		if (fRoute == NULL)
2007 			return ENETUNREACH;
2008 	}
2009 
2010 	// make sure connection does not already exist
2011 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2012 		fRoute->interface->address);
2013 	if (status < B_OK)
2014 		return status;
2015 
2016 	fInitialSendSequence = system_time() >> 4;
2017 	fSendNext = fInitialSendSequence;
2018 	fSendUnacknowledged = fInitialSendSequence;
2019 	fSendMax = fInitialSendSequence;
2020 	fSendUrgentOffset = fInitialSendSequence;
2021 
2022 	// we are counting the SYN here
2023 	fSendQueue.SetInitialSequence(fSendNext + 1);
2024 
2025 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2026 
2027 	// Compute the window shift we advertise to our peer - if it doesn't support
2028 	// this option, this will be reset to 0 (when its SYN is received)
2029 	fReceiveWindowShift = 0;
2030 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2031 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2032 		fReceiveWindowShift++;
2033 	}
2034 
2035 	return B_OK;
2036 }
2037 
2038 
2039 void
2040 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2041 {
2042 	size_t previouslyUsed = fSendQueue.Used();
2043 
2044 	fSendQueue.RemoveUntil(segment.acknowledge);
2045 	fSendUnacknowledged = segment.acknowledge;
2046 
2047 	if (fSendNext < fSendUnacknowledged)
2048 		fSendNext = fSendUnacknowledged;
2049 
2050 	if (fSendUnacknowledged == fSendMax)
2051 		gStackModule->cancel_timer(&fRetransmitTimer);
2052 
2053 	if (fSendQueue.Used() < previouslyUsed) {
2054 		// this ACK acknowledged data
2055 
2056 		if (segment.options & TCP_HAS_TIMESTAMPS)
2057 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2058 		else {
2059 			// TODO: Fallback to RFC 793 type estimation
2060 		}
2061 
2062 		if (is_writable(fState)) {
2063 			// notify threads waiting on the socket to become writable again
2064 			fSendList.Signal();
2065 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2066 		}
2067 
2068 		if (fCongestionWindow < fSlowStartThreshold)
2069 			fCongestionWindow += fSendMaxSegmentSize;
2070 	}
2071 
2072 	if (fCongestionWindow >= fSlowStartThreshold) {
2073 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2074 
2075 		if (increment < fCongestionWindow)
2076 			increment = 1;
2077 		else
2078 			increment /= fCongestionWindow;
2079 
2080 		fCongestionWindow += increment;
2081 	}
2082 
2083 	// if there is data left to be send, send it now
2084 	if (fSendQueue.Used() > 0)
2085 		_SendQueued();
2086 }
2087 
2088 
2089 void
2090 TCPEndpoint::_Retransmit()
2091 {
2092 	TRACE("Retransmit()");
2093 	_ResetSlowStart();
2094 	fSendNext = fSendUnacknowledged;
2095 	_SendQueued();
2096 }
2097 
2098 
2099 void
2100 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2101 {
2102 	int32 rtt = roundTripTime;
2103 
2104 	// "smooth" round trip time as per Van Jacobson
2105 	rtt -= fRoundTripTime / 8;
2106 	fRoundTripTime += rtt;
2107 	if (rtt < 0)
2108 		rtt = -rtt;
2109 	rtt -= fRoundTripDeviation / 4;
2110 	fRoundTripDeviation += rtt;
2111 
2112 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2113 		* kTimestampFactor;
2114 
2115 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2116 		roundTripTime);
2117 }
2118 
2119 
2120 void
2121 TCPEndpoint::_ResetSlowStart()
2122 {
2123 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2,
2124 		2 * fSendMaxSegmentSize);
2125 	fCongestionWindow = fSendMaxSegmentSize;
2126 }
2127 
2128 
2129 //	#pragma mark - timer
2130 
2131 
2132 /*static*/ void
2133 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2134 {
2135 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2136 	T(Timer(endpoint, "retransmit"));
2137 
2138 	MutexLocker locker(endpoint->fLock);
2139 	if (!locker.IsLocked())
2140 		return;
2141 
2142 	endpoint->_Retransmit();
2143 }
2144 
2145 
2146 /*static*/ void
2147 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2148 {
2149 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2150 	T(Timer(endpoint, "persist"));
2151 
2152 	MutexLocker locker(endpoint->fLock);
2153 	if (!locker.IsLocked())
2154 		return;
2155 
2156 	// the timer might not have been canceled early enough
2157 	if (endpoint->State() == CLOSED)
2158 		return;
2159 
2160 	endpoint->_SendQueued(true);
2161 }
2162 
2163 
2164 /*static*/ void
2165 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2166 {
2167 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2168 	T(Timer(endpoint, "delayed ack"));
2169 
2170 	MutexLocker locker(endpoint->fLock);
2171 	if (!locker.IsLocked())
2172 		return;
2173 
2174 	// the timer might not have been canceled early enough
2175 	if (endpoint->State() == CLOSED)
2176 		return;
2177 
2178 	endpoint->SendAcknowledge(true);
2179 }
2180 
2181 
2182 /*static*/ void
2183 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2184 {
2185 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2186 	T(Timer(endpoint, "time-wait"));
2187 
2188 	MutexLocker locker(endpoint->fLock);
2189 	if (!locker.IsLocked())
2190 		return;
2191 
2192 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2193 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2194 		return;
2195 	}
2196 
2197 	locker.Unlock();
2198 
2199 	gSocketModule->delete_socket(endpoint->socket);
2200 }
2201 
2202 
2203 //	#pragma mark -
2204 
2205 
2206 void
2207 TCPEndpoint::Dump() const
2208 {
2209 	kprintf("TCP endpoint %p\n", this);
2210 	kprintf("  state: %s\n", name_for_state(fState));
2211 	kprintf("  flags: 0x%lx\n", fFlags);
2212 #ifdef KDEBUG
2213 	kprintf("  lock: { %p, holder: %ld }\n", &fLock, fLock.holder);
2214 #endif
2215 	kprintf("  accept sem: %ld\n", fAcceptSemaphore);
2216 	kprintf("  options: 0x%lx\n", (uint32)fOptions);
2217 	kprintf("  send\n");
2218 	kprintf("    window shift: %lu\n", (uint32)fSendWindowShift);
2219 	kprintf("    unacknowledged: %lu\n", (uint32)fSendUnacknowledged);
2220 	kprintf("    next: %lu\n", (uint32)fSendNext);
2221 	kprintf("    max: %lu\n", (uint32)fSendMax);
2222 	kprintf("    urgent offset: %lu\n", (uint32)fSendUrgentOffset);
2223 	kprintf("    window: %lu\n", fSendWindow);
2224 	kprintf("    max window: %lu\n", fSendMaxWindow);
2225 	kprintf("    max segment size: %lu\n", fSendMaxSegmentSize);
2226 	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
2227 	kprintf("    last acknowledge sent: %lu\n", (uint32)fLastAcknowledgeSent);
2228 	kprintf("    initial sequence: %lu\n", (uint32)fInitialSendSequence);
2229 	kprintf("  receive\n");
2230 	kprintf("    window shift: %lu\n", (uint32)fReceiveWindowShift);
2231 	kprintf("    next: %lu\n", (uint32)fReceiveNext);
2232 	kprintf("    max advertised: %lu\n", (uint32)fReceiveMaxAdvertised);
2233 	kprintf("    window: %lu\n", (uint32)fReceiveWindow);
2234 	kprintf("    max segment size: %lu\n", (uint32)fReceiveMaxSegmentSize);
2235 	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
2236 		fReceiveQueue.Size());
2237 	kprintf("    initial sequence: %lu\n", (uint32)fInitialReceiveSequence);
2238 	kprintf("    duplicate acknowledge count: %lu\n",
2239 		fDuplicateAcknowledgeCount);
2240 	kprintf("  round trip time: %ld (deviation %ld)\n", fRoundTripTime,
2241 		fRoundTripDeviation);
2242 	kprintf("  retransmit timeout: %llu\n", (uint64)fRetransmitTimeout);
2243 	kprintf("  congestion window: %lu\n", fCongestionWindow);
2244 	kprintf("  slow start threshold: %lu\n", fSlowStartThreshold);
2245 }
2246 
2247